Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 3 Nov 2023 05:38:47 +0000 (19:38 -1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 3 Nov 2023 05:38:47 +0000 (19:38 -1000)
Pull MM updates from Andrew Morton:
 "Many singleton patches against the MM code. The patch series which are
  included in this merge do the following:

   - Kemeng Shi has contributed some compaction maintenance work in the
     series 'Fixes and cleanups to compaction'

   - Joel Fernandes has a patchset ('Optimize mremap during mutual
     alignment within PMD') which fixes an obscure issue with mremap()'s
     pagetable handling during a subsequent exec(), based upon an
     implementation which Linus suggested

   - More DAMON/DAMOS maintenance and feature work from SeongJae Park in
     the following patch series:

        mm/damon: misc fixups for documents, comments and its tracepoint
        mm/damon: add a tracepoint for damos apply target regions
        mm/damon: provide pseudo-moving sum based access rate
        mm/damon: implement DAMOS apply intervals
        mm/damon/core-test: Fix memory leaks in core-test
        mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval

   - In the series 'Do not try to access unaccepted memory' Adrian
     Hunter provides some fixups for the recently-added 'unaccepted
     memory' feature, to increase the feature's checking coverage: 'Plug
     a few gaps where RAM is exposed without checking if it is
     unaccepted memory'

   - In the series 'cleanups for lockless slab shrink' Qi Zheng has done
     some maintenance work which is preparation for the lockless slab
     shrinking code

   - Qi Zheng has redone the earlier (and reverted) attempt to make slab
     shrinking lockless in the series 'use refcount+RCU method to
     implement lockless slab shrink'

   - David Hildenbrand contributes some maintenance work for the rmap
     code in the series 'Anon rmap cleanups'

   - Kefeng Wang does more folio conversions and some maintenance work
     in the migration code. Series 'mm: migrate: more folio conversion
     and unification'

   - Matthew Wilcox has fixed an issue in the buffer_head code which was
     causing long stalls under some heavy memory/IO loads. Some cleanups
     were added on the way. Series 'Add and use bdev_getblk()'

   - In the series 'Use nth_page() in place of direct struct page
     manipulation' Zi Yan has fixed a potential issue with the direct
     manipulation of hugetlb page frames

   - The series 'mm: hugetlb: Skip initialization of gigantic tail
     struct pages if freed by HVO' has improved our handling of gigantic
     pages in the hugetlb vmemmap optimization code. This provides
     significant boot time improvements when large numbers of gigantic
     pages are in use

   - Matthew Wilcox has sent the series 'Small hugetlb cleanups' - code
     rationalization and folio conversions in the hugetlb code

   - Yin Fengwei has improved mlock()'s handling of large folios in the
     series 'support large folio for mlock'

   - In the series 'Expose swapcache stat for memcg v1' Liu Shixin has
     added statistics for memcg v1 users which are available (and
     useful) under memcg v2

   - Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
     prctl so that userspace may direct the kernel to not automatically
     propagate the denial to child processes. The series is named 'MDWE
     without inheritance'
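
     (Not part of the merge: a minimal userspace sketch of how the new
     flag is expected to be used. The fallback #define values below are
     assumptions for uapi headers that predate this series.)

        #include <stdio.h>
        #include <sys/prctl.h>

        #ifndef PR_SET_MDWE
        #define PR_SET_MDWE                     65
        #endif
        #ifndef PR_MDWE_REFUSE_EXEC_GAIN
        #define PR_MDWE_REFUSE_EXEC_GAIN        (1UL << 0)
        #endif
        #ifndef PR_MDWE_NO_INHERIT              /* the bit added by this series */
        #define PR_MDWE_NO_INHERIT              (1UL << 1)
        #endif

        int main(void)
        {
                /* Deny write+execute memory in this process, but do not
                 * propagate the restriction to child processes. */
                if (prctl(PR_SET_MDWE,
                          PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
                          0, 0, 0))
                        perror("PR_SET_MDWE");
                return 0;
        }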

   - Kefeng Wang has provided the series 'mm: convert numa balancing
     functions to use a folio' which does what it says

   - In the series 'mm/ksm: add fork-exec support for prctl' Stefan
     Roesch makes it possible for a process to propagate KSM treatment
     across exec()
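
     (Not part of the merge: a hedged sketch of the prctl() involved.
     PR_SET_MEMORY_MERGE already existed; what this series changes is
     that the opt-in now survives a subsequent fork()+exec(). The
     fallback value is an assumption for older <sys/prctl.h> headers.)

        #include <stdio.h>
        #include <unistd.h>
        #include <sys/prctl.h>

        #ifndef PR_SET_MEMORY_MERGE
        #define PR_SET_MEMORY_MERGE     67
        #endif

        int main(int argc, char **argv)
        {
                /* Opt the whole process into KSM merging. */
                if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
                        perror("PR_SET_MEMORY_MERGE");

                /* With this series the exec()ed program keeps the KSM
                 * opt-in instead of silently losing it. */
                if (argc > 1)
                        execv(argv[1], &argv[1]);
                return 0;
        }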

   - Huang Ying has enhanced memory tiering's calculation of memory
     distances. This is used to permit the dax/kmem driver to use 'high
     bandwidth memory' in addition to Optane Data Center Persistent
     Memory Modules (DCPMM). The series is named 'memory tiering:
     calculate abstract distance based on ACPI HMAT'

   - In the series 'Smart scanning mode for KSM' Stefan Roesch has
     optimized KSM by teaching it to retain and use some historical
     information from previous scans

   - Yosry Ahmed has fixed some inconsistencies in memcg statistics in
     the series 'mm: memcg: fix tracking of pending stats updates
     values'

   - In the series 'Implement IOCTL to get and optionally clear info
     about PTEs' Peter Xu has added an ioctl to /proc/<pid>/pagemap
     which permits us to atomically read-then-clear page softdirty
     state. This is mainly used by CRIU

   - Hugh Dickins contributed the series 'shmem,tmpfs: general
     maintenance', a bunch of relatively minor maintenance tweaks to
     this code

   - Matthew Wilcox has increased the use of the VMA lock over
     file-backed page faults in the series 'Handle more faults under the
     VMA lock'. Some rationalizations of the fault path became possible
     as a result

   - In the series 'mm/rmap: convert page_move_anon_rmap() to
     folio_move_anon_rmap()' David Hildenbrand has implemented some
     cleanups and folio conversions

   - In the series 'various improvements to the GUP interface' Lorenzo
     Stoakes has simplified and improved the GUP interface with an eye
     to providing groundwork for future improvements

   - Andrey Konovalov has sent along the series 'kasan: assorted fixes
     and improvements' which does those things

   - Some page allocator maintenance work from Kemeng Shi in the series
     'Two minor cleanups to break_down_buddy_pages'

   - In the series 'New selftest for mm' Breno Leitao has developed
     another MM self test which tickles a race we had between madvise()
     and page faults

   - In the series 'Add folio_end_read' Matthew Wilcox provides cleanups
     and an optimization to the core pagecache code

   - Nhat Pham has added memcg accounting for hugetlb memory in the
     series 'hugetlb memcg accounting'

   - Cleanups and rationalizations to the pagemap code from Lorenzo
     Stoakes, in the series 'Abstract vma_merge() and split_vma()'

   - Audra Mitchell has fixed issues in the procfs page_owner code's new
     timestamping feature which was causing some misbehaviours. In the
     series 'Fix page_owner's use of free timestamps'

   - Lorenzo Stoakes has fixed the handling of new mappings of sealed
     files in the series 'permit write-sealed memfd read-only shared
     mappings'
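
     (Not part of the merge: a hedged userspace sketch of the case this
     series addresses: mapping a write-sealed memfd read-only with
     MAP_SHARED, which previously could be refused outright.)

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = memfd_create("sealed", MFD_CLOEXEC | MFD_ALLOW_SEALING);

                if (fd < 0) {
                        perror("memfd_create");
                        return 1;
                }
                if (ftruncate(fd, 4096) || write(fd, "hello", 5) != 5)
                        return 1;

                /* Forbid any further writes through any fd. */
                if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE)) {
                        perror("F_ADD_SEALS");
                        return 1;
                }

                /* Read-only shared mapping of the write-sealed memfd:
                 * previously rejected, permitted after this series. */
                char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

                if (p == MAP_FAILED) {
                        perror("mmap");
                        return 1;
                }
                printf("%.5s\n", p);
                return 0;
        }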

   - Mike Kravetz has optimized the hugetlb vmemmap optimization in the
     series 'Batch hugetlb vmemmap modification operations'

   - Some buffer_head folio conversions and cleanups from Matthew Wilcox
     in the series 'Finish the create_empty_buffers() transition'

   - As a page allocator performance optimization Huang Ying has added
     automatic tuning to the allocator's per-cpu-pages feature, in the
     series 'mm: PCP high auto-tuning'

   - Roman Gushchin has contributed the patchset 'mm: improve
     performance of accounted kernel memory allocations' which improves
     their performance by ~30% as measured by a micro-benchmark

   - folio conversions from Kefeng Wang in the series 'mm: convert page
     cpupid functions to folios'

   - Some kmemleak fixups in Liu Shixin's series 'Some bugfix about
     kmemleak'

   - Qi Zheng has improved our handling of memoryless nodes by keeping
     them off the allocation fallback list. This is done in the series
     'handle memoryless nodes more appropriately'

   - khugepaged conversions from Vishal Moola in the series 'Some
     khugepaged folio conversions'"

[ bcachefs conflicts with the dynamically allocated shrinkers have been
  resolved as per Stephen Rothwell in

     https://lore.kernel.org/all/20230913093553.4290421e@canb.auug.org.au/

  with help from Qi Zheng.

  The clone3 test filtering conflict was half-arsed by yours truly ]

* tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (406 commits)
  mm/damon/sysfs: update monitoring target regions for online input commit
  mm/damon/sysfs: remove requested targets when online-commit inputs
  selftests: add a sanity check for zswap
  Documentation: maple_tree: fix word spelling error
  mm/vmalloc: fix the unchecked dereference warning in vread_iter()
  zswap: export compression failure stats
  Documentation: ubsan: drop "the" from article title
  mempolicy: migration attempt to match interleave nodes
  mempolicy: mmap_lock is not needed while migrating folios
  mempolicy: alloc_pages_mpol() for NUMA policy without vma
  mm: add page_rmappable_folio() wrapper
  mempolicy: remove confusing MPOL_MF_LAZY dead code
  mempolicy: mpol_shared_policy_init() without pseudo-vma
  mempolicy trivia: use pgoff_t in shared mempolicy tree
  mempolicy trivia: slightly more consistent naming
  mempolicy trivia: delete those ancient pr_debug()s
  mempolicy: fix migrate_pages(2) syscall return nr_failed
  kernfs: drop shared NUMA mempolicy hooks
  hugetlbfs: drop shared NUMA mempolicy pretence
  mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()
  ...

69 files changed:
Documentation/admin-guide/cgroup-v2.rst
MAINTAINERS
arch/arm64/kernel/mte.c
arch/x86/include/asm/bitops.h
arch/x86/kvm/mmu/mmu.c
drivers/acpi/acpi_pad.c
drivers/firmware/efi/unaccepted_memory.c
drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/msm/msm_drv.c
drivers/gpu/drm/msm/msm_drv.h
drivers/gpu/drm/panfrost/panfrost_device.h
drivers/gpu/drm/panfrost/panfrost_drv.c
drivers/gpu/drm/panfrost/panfrost_gem.h
drivers/md/bcache/bcache.h
drivers/md/dm-cache-metadata.c
drivers/md/raid5.c
drivers/virtio/virtio_balloon.c
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_types.h
fs/bcachefs/fs.c
fs/bcachefs/sysfs.c
fs/btrfs/super.c
fs/erofs/utils.c
fs/ext4/ext4.h
fs/ext4/extents_status.c
fs/ext4/inode.c
fs/ext4/super.c
fs/f2fs/super.c
fs/gfs2/bmap.c
fs/gfs2/glock.c
fs/gfs2/quota.c
fs/hugetlbfs/inode.c
fs/iomap/buffered-io.c
fs/nfs/super.c
fs/nfsd/filecache.c
fs/nfsd/nfs4state.c
fs/ntfs3/file.c
fs/ocfs2/aops.c
fs/proc/task_mmu.c
fs/quota/dquot.c
fs/reiserfs/inode.c
fs/super.c
fs/ubifs/super.c
fs/ufs/inode.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
include/linux/cgroup-defs.h
include/linux/fs.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/sched.h
include/linux/sched/numa_balancing.h
kernel/cgroup/cgroup.c
kernel/exit.c
kernel/fork.c
kernel/rcu/tree.c
kernel/sched/fair.c
mm/mempolicy.c
mm/mmap.c
mm/nommu.c
mm/percpu.c
mm/shmem.c
mm/util.c
net/sunrpc/auth.c
tools/testing/selftests/clone3/clone3.c
tools/testing/selftests/damon/sysfs.sh
tools/testing/selftests/mm/mremap_test.c

diff --cc MAINTAINERS
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 135278ddaf627bb1fd41ba6062a3596e52bd8f72,79ba576b22e3c9d0cf50f74f9e1197f95759b17a..3f2f7bf6e33526edeaa3a74288d0b14b79013aaf
@@@ -3,19 -3,12 +3,20 @@@
  #include <linux/efi.h>
  #include <linux/memblock.h>
  #include <linux/spinlock.h>
+ #include <linux/crash_dump.h>
  #include <asm/unaccepted_memory.h>
  
 -/* Protects unaccepted memory bitmap */
 +/* Protects unaccepted memory bitmap and accepting_list */
  static DEFINE_SPINLOCK(unaccepted_memory_lock);
  
 +struct accept_range {
 +      struct list_head list;
 +      unsigned long start;
 +      unsigned long end;
 +};
 +
 +static LIST_HEAD(accepting_list);
 +
  /*
   * accept_memory() -- Consult bitmap and accept the memory if needed.
   *
Simple merge
index 443bbc3ed75089110ef9f17f014e99be5209ff60,7f20249d60715b1676e885aa485c94f024fec7d8..2aae7d107f3356e08b55b6b05bf7cf96205318a0
@@@ -265,12 -457,23 +265,14 @@@ static int msm_drm_init(struct device *
        if (ret)
                goto err_deinit_vram;
  
-       msm_gem_shrinker_init(ddev);
 -      /* the fw fb could be anywhere in memory */
 -      ret = drm_aperture_remove_framebuffers(drv);
 -      if (ret)
 -              goto err_msm_uninit;
 -
+       ret = msm_gem_shrinker_init(ddev);
+       if (ret)
+               goto err_msm_uninit;
  
        if (priv->kms_init) {
 -              ret = priv->kms_init(ddev);
 -              if (ret) {
 -                      DRM_DEV_ERROR(dev, "failed to load kms\n");
 -                      priv->kms = NULL;
 +              ret = msm_drm_kms_init(dev, drv);
 +              if (ret)
                        goto err_msm_uninit;
 -              }
 -              kms = priv->kms;
        } else {
                /* valid only for the dummy headless case, where of_node=NULL */
                WARN_ON(dev->of_node);
Simple merge
index 1e85656dc2f7fe71e57d35ab64b672aa3ebd6883,e667e56893536709bea72867f135a5c1fdd735dd..1ef38f60d5dc4e96f2878d0e6b0ad4c16d1f382d
@@@ -119,14 -118,9 +119,14 @@@ struct panfrost_device 
  
        struct mutex shrinker_lock;
        struct list_head shrinker_list;
-       struct shrinker shrinker;
+       struct shrinker *shrinker;
  
        struct panfrost_devfreq pfdevfreq;
 +
 +      struct {
 +              atomic_t use_count;
 +              spinlock_t lock;
 +      } cycle_counter;
  };
  
  struct panfrost_mmu {
Simple merge
Simple merge
Simple merge
Simple merge
index 82cf243aa28830ef1720c994bd7261eebbb23cba,0000000000000000000000000000000000000000..5e585819190576db1f22ec9ec19b9b3322f1caf1
mode 100644,000000..100644
--- /dev/null
@@@ -1,1202 -1,0 +1,1204 @@@
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
 +// SPDX-License-Identifier: GPL-2.0
 +
 +#include "bcachefs.h"
 +#include "bkey_buf.h"
 +#include "btree_cache.h"
 +#include "btree_io.h"
 +#include "btree_iter.h"
 +#include "btree_locking.h"
 +#include "debug.h"
 +#include "errcode.h"
 +#include "error.h"
 +#include "trace.h"
 +
 +#include <linux/prefetch.h>
 +#include <linux/sched/mm.h>
 +
 +const char * const bch2_btree_node_flags[] = {
 +#define x(f)  #f,
 +      BTREE_FLAGS()
 +#undef x
 +      NULL
 +};
 +
 +void bch2_recalc_btree_reserve(struct bch_fs *c)
 +{
 +      unsigned i, reserve = 16;
 +
 +      if (!c->btree_roots_known[0].b)
 +              reserve += 8;
 +
 +      for (i = 0; i < btree_id_nr_alive(c); i++) {
 +              struct btree_root *r = bch2_btree_id_root(c, i);
 +
 +              if (r->b)
 +                      reserve += min_t(unsigned, 1, r->b->c.level) * 8;
 +      }
 +
 +      c->btree_cache.reserve = reserve;
 +}
 +
 +static inline unsigned btree_cache_can_free(struct btree_cache *bc)
 +{
 +      return max_t(int, 0, bc->used - bc->reserve);
 +}
 +
 +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
 +{
 +      if (b->c.lock.readers)
 +              list_move(&b->list, &bc->freed_pcpu);
 +      else
 +              list_move(&b->list, &bc->freed_nonpcpu);
 +}
 +
 +static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +
 +      EBUG_ON(btree_node_write_in_flight(b));
 +
 +      clear_btree_node_just_written(b);
 +
 +      kvpfree(b->data, btree_bytes(c));
 +      b->data = NULL;
 +#ifdef __KERNEL__
 +      kvfree(b->aux_data);
 +#else
 +      munmap(b->aux_data, btree_aux_data_bytes(b));
 +#endif
 +      b->aux_data = NULL;
 +
 +      bc->used--;
 +
 +      btree_node_to_freedlist(bc, b);
 +}
 +
 +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
 +                                 const void *obj)
 +{
 +      const struct btree *b = obj;
 +      const u64 *v = arg->key;
 +
 +      return b->hash_val == *v ? 0 : 1;
 +}
 +
 +static const struct rhashtable_params bch_btree_cache_params = {
 +      .head_offset    = offsetof(struct btree, hash),
 +      .key_offset     = offsetof(struct btree, hash_val),
 +      .key_len        = sizeof(u64),
 +      .obj_cmpfn      = bch2_btree_cache_cmp_fn,
 +};
 +
 +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 +{
 +      BUG_ON(b->data || b->aux_data);
 +
 +      b->data = kvpmalloc(btree_bytes(c), gfp);
 +      if (!b->data)
 +              return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 +#ifdef __KERNEL__
 +      b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
 +#else
 +      b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
 +                         PROT_READ|PROT_WRITE|PROT_EXEC,
 +                         MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
 +      if (b->aux_data == MAP_FAILED)
 +              b->aux_data = NULL;
 +#endif
 +      if (!b->aux_data) {
 +              kvpfree(b->data, btree_bytes(c));
 +              b->data = NULL;
 +              return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 +      }
 +
 +      return 0;
 +}
 +
 +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 +{
 +      struct btree *b;
 +
 +      b = kzalloc(sizeof(struct btree), gfp);
 +      if (!b)
 +              return NULL;
 +
 +      bkey_btree_ptr_init(&b->key);
 +      INIT_LIST_HEAD(&b->list);
 +      INIT_LIST_HEAD(&b->write_blocked);
 +      b->byte_order = ilog2(btree_bytes(c));
 +      return b;
 +}
 +
 +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      b = __btree_node_mem_alloc(c, GFP_KERNEL);
 +      if (!b)
 +              return NULL;
 +
 +      if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
 +              kfree(b);
 +              return NULL;
 +      }
 +
 +      bch2_btree_lock_init(&b->c, 0);
 +
 +      bc->used++;
 +      list_add(&b->list, &bc->freeable);
 +      return b;
 +}
 +
 +/* Btree in memory cache - hash table */
 +
 +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 +{
 +      int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
 +
 +      BUG_ON(ret);
 +
 +      /* Cause future lookups for this node to fail: */
 +      b->hash_val = 0;
 +}
 +
 +int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
 +{
 +      BUG_ON(b->hash_val);
 +      b->hash_val = btree_ptr_hash_val(&b->key);
 +
 +      return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
 +                                           bch_btree_cache_params);
 +}
 +
 +int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
 +                              unsigned level, enum btree_id id)
 +{
 +      int ret;
 +
 +      b->c.level      = level;
 +      b->c.btree_id   = id;
 +
 +      mutex_lock(&bc->lock);
 +      ret = __bch2_btree_node_hash_insert(bc, b);
 +      if (!ret)
 +              list_add_tail(&b->list, &bc->live);
 +      mutex_unlock(&bc->lock);
 +
 +      return ret;
 +}
 +
 +__flatten
 +static inline struct btree *btree_cache_find(struct btree_cache *bc,
 +                                   const struct bkey_i *k)
 +{
 +      u64 v = btree_ptr_hash_val(k);
 +
 +      return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
 +}
 +
 +/*
 + * this version is for btree nodes that have already been freed (we're not
 + * reaping a real btree node)
 + */
 +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      int ret = 0;
 +
 +      lockdep_assert_held(&bc->lock);
 +wait_on_io:
 +      if (b->flags & ((1U << BTREE_NODE_dirty)|
 +                      (1U << BTREE_NODE_read_in_flight)|
 +                      (1U << BTREE_NODE_write_in_flight))) {
 +              if (!flush)
 +                      return -BCH_ERR_ENOMEM_btree_node_reclaim;
 +
 +              /* XXX: waiting on IO with btree cache lock held */
 +              bch2_btree_node_wait_on_read(b);
 +              bch2_btree_node_wait_on_write(b);
 +      }
 +
 +      if (!six_trylock_intent(&b->c.lock))
 +              return -BCH_ERR_ENOMEM_btree_node_reclaim;
 +
 +      if (!six_trylock_write(&b->c.lock))
 +              goto out_unlock_intent;
 +
 +      /* recheck under lock */
 +      if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
 +                      (1U << BTREE_NODE_write_in_flight))) {
 +              if (!flush)
 +                      goto out_unlock;
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              goto wait_on_io;
 +      }
 +
 +      if (btree_node_noevict(b) ||
 +          btree_node_write_blocked(b) ||
 +          btree_node_will_make_reachable(b))
 +              goto out_unlock;
 +
 +      if (btree_node_dirty(b)) {
 +              if (!flush)
 +                      goto out_unlock;
 +              /*
 +               * Using the underscore version because we don't want to compact
 +               * bsets after the write, since this node is about to be evicted
 +               * - unless btree verify mode is enabled, since it runs out of
 +               * the post write cleanup:
 +               */
 +              if (bch2_verify_btree_ondisk)
 +                      bch2_btree_node_write(c, b, SIX_LOCK_intent,
 +                                            BTREE_WRITE_cache_reclaim);
 +              else
 +                      __bch2_btree_node_write(c, b,
 +                                              BTREE_WRITE_cache_reclaim);
 +
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              goto wait_on_io;
 +      }
 +out:
 +      if (b->hash_val && !ret)
 +              trace_and_count(c, btree_cache_reap, c, b);
 +      return ret;
 +out_unlock:
 +      six_unlock_write(&b->c.lock);
 +out_unlock_intent:
 +      six_unlock_intent(&b->c.lock);
 +      ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
 +      goto out;
 +}
 +
 +static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
 +{
 +      return __btree_node_reclaim(c, b, false);
 +}
 +
 +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 +{
 +      return __btree_node_reclaim(c, b, true);
 +}
 +
 +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 +                                         struct shrink_control *sc)
 +{
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b, *t;
 +      unsigned long nr = sc->nr_to_scan;
 +      unsigned long can_free = 0;
 +      unsigned long freed = 0;
 +      unsigned long touched = 0;
 +      unsigned i, flags;
 +      unsigned long ret = SHRINK_STOP;
 +      bool trigger_writes = atomic_read(&bc->dirty) + nr >=
 +              bc->used * 3 / 4;
 +
 +      if (bch2_btree_shrinker_disabled)
 +              return SHRINK_STOP;
 +
 +      mutex_lock(&bc->lock);
 +      flags = memalloc_nofs_save();
 +
 +      /*
 +       * It's _really_ critical that we don't free too many btree nodes - we
 +       * have to always leave ourselves a reserve. The reserve is how we
 +       * guarantee that allocating memory for a new btree node can always
 +       * succeed, so that inserting keys into the btree can always succeed and
 +       * IO can always make forward progress:
 +       */
 +      can_free = btree_cache_can_free(bc);
 +      nr = min_t(unsigned long, nr, can_free);
 +
 +      i = 0;
 +      list_for_each_entry_safe(b, t, &bc->freeable, list) {
 +              /*
 +               * Leave a few nodes on the freeable list, so that a btree split
 +               * won't have to hit the system allocator:
 +               */
 +              if (++i <= 3)
 +                      continue;
 +
 +              touched++;
 +
 +              if (touched >= nr)
 +                      goto out;
 +
 +              if (!btree_node_reclaim(c, b)) {
 +                      btree_node_data_free(c, b);
 +                      six_unlock_write(&b->c.lock);
 +                      six_unlock_intent(&b->c.lock);
 +                      freed++;
 +              }
 +      }
 +restart:
 +      list_for_each_entry_safe(b, t, &bc->live, list) {
 +              touched++;
 +
 +              if (btree_node_accessed(b)) {
 +                      clear_btree_node_accessed(b);
 +              } else if (!btree_node_reclaim(c, b)) {
 +                      freed++;
 +                      btree_node_data_free(c, b);
 +
 +                      bch2_btree_node_hash_remove(bc, b);
 +                      six_unlock_write(&b->c.lock);
 +                      six_unlock_intent(&b->c.lock);
 +
 +                      if (freed == nr)
 +                              goto out_rotate;
 +              } else if (trigger_writes &&
 +                         btree_node_dirty(b) &&
 +                         !btree_node_will_make_reachable(b) &&
 +                         !btree_node_write_blocked(b) &&
 +                         six_trylock_read(&b->c.lock)) {
 +                      list_move(&bc->live, &b->list);
 +                      mutex_unlock(&bc->lock);
 +                      __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
 +                      six_unlock_read(&b->c.lock);
 +                      if (touched >= nr)
 +                              goto out_nounlock;
 +                      mutex_lock(&bc->lock);
 +                      goto restart;
 +              }
 +
 +              if (touched >= nr)
 +                      break;
 +      }
 +out_rotate:
 +      if (&t->list != &bc->live)
 +              list_move_tail(&bc->live, &t->list);
 +out:
 +      mutex_unlock(&bc->lock);
 +out_nounlock:
 +      ret = freed;
 +      memalloc_nofs_restore(flags);
 +      trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
 +      return ret;
 +}
 +
 +static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 +                                          struct shrink_control *sc)
 +{
-       unregister_shrinker(&bc->shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_cache *bc = &c->btree_cache;
 +
 +      if (bch2_btree_shrinker_disabled)
 +              return 0;
 +
 +      return btree_cache_can_free(bc);
 +}
 +
 +void bch2_fs_btree_cache_exit(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      unsigned i, flags;
 +
-       bc->shrink.count_objects        = bch2_btree_cache_count;
-       bc->shrink.scan_objects         = bch2_btree_cache_scan;
-       bc->shrink.seeks                = 4;
-       ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
-       if (ret)
++      shrinker_free(bc->shrink);
 +
 +      /* vfree() can allocate memory: */
 +      flags = memalloc_nofs_save();
 +      mutex_lock(&bc->lock);
 +
 +      if (c->verify_data)
 +              list_move(&c->verify_data->list, &bc->live);
 +
 +      kvpfree(c->verify_ondisk, btree_bytes(c));
 +
 +      for (i = 0; i < btree_id_nr_alive(c); i++) {
 +              struct btree_root *r = bch2_btree_id_root(c, i);
 +
 +              if (r->b)
 +                      list_add(&r->b->list, &bc->live);
 +      }
 +
 +      list_splice(&bc->freeable, &bc->live);
 +
 +      while (!list_empty(&bc->live)) {
 +              b = list_first_entry(&bc->live, struct btree, list);
 +
 +              BUG_ON(btree_node_read_in_flight(b) ||
 +                     btree_node_write_in_flight(b));
 +
 +              if (btree_node_dirty(b))
 +                      bch2_btree_complete_write(c, b, btree_current_write(b));
 +              clear_btree_node_dirty_acct(c, b);
 +
 +              btree_node_data_free(c, b);
 +      }
 +
 +      BUG_ON(atomic_read(&c->btree_cache.dirty));
 +
 +      list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
 +
 +      while (!list_empty(&bc->freed_nonpcpu)) {
 +              b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
 +              list_del(&b->list);
 +              six_lock_exit(&b->c.lock);
 +              kfree(b);
 +      }
 +
 +      mutex_unlock(&bc->lock);
 +      memalloc_nofs_restore(flags);
 +
 +      if (bc->table_init_done)
 +              rhashtable_destroy(&bc->table);
 +}
 +
 +int bch2_fs_btree_cache_init(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
++      struct shrinker *shrink;
 +      unsigned i;
 +      int ret = 0;
 +
 +      ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
 +      if (ret)
 +              goto err;
 +
 +      bc->table_init_done = true;
 +
 +      bch2_recalc_btree_reserve(c);
 +
 +      for (i = 0; i < bc->reserve; i++)
 +              if (!__bch2_btree_node_mem_alloc(c))
 +                      goto err;
 +
 +      list_splice_init(&bc->live, &bc->freeable);
 +
 +      mutex_init(&c->verify_lock);
 +
++      shrink = shrinker_alloc(0, "%s/btree_cache", c->name);
++      if (!shrink)
 +              goto err;
++      bc->shrink = shrink;
++      shrink->count_objects   = bch2_btree_cache_count;
++      shrink->scan_objects    = bch2_btree_cache_scan;
++      shrink->seeks           = 4;
++      shrink->private_data    = c;
++      shrinker_register(shrink);
 +
 +      return 0;
 +err:
 +      return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 +}
 +
 +void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
 +{
 +      mutex_init(&bc->lock);
 +      INIT_LIST_HEAD(&bc->live);
 +      INIT_LIST_HEAD(&bc->freeable);
 +      INIT_LIST_HEAD(&bc->freed_pcpu);
 +      INIT_LIST_HEAD(&bc->freed_nonpcpu);
 +}
 +
 +/*
 + * We can only have one thread cannibalizing other cached btree nodes at a time,
 + * or we'll deadlock. We use an open coded mutex to ensure that, which a
 + * cannibalize_bucket() will take. This means every time we unlock the root of
 + * the btree, we need to release this lock if we have it held.
 + */
 +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +
 +      if (bc->alloc_lock == current) {
 +              trace_and_count(c, btree_cache_cannibalize_unlock, c);
 +              bc->alloc_lock = NULL;
 +              closure_wake_up(&bc->alloc_wait);
 +      }
 +}
 +
 +int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct task_struct *old;
 +
 +      old = cmpxchg(&bc->alloc_lock, NULL, current);
 +      if (old == NULL || old == current)
 +              goto success;
 +
 +      if (!cl) {
 +              trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
 +              return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
 +      }
 +
 +      closure_wait(&bc->alloc_wait, cl);
 +
 +      /* Try again, after adding ourselves to waitlist */
 +      old = cmpxchg(&bc->alloc_lock, NULL, current);
 +      if (old == NULL || old == current) {
 +              /* We raced */
 +              closure_wake_up(&bc->alloc_wait);
 +              goto success;
 +      }
 +
 +      trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
 +      return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
 +
 +success:
 +      trace_and_count(c, btree_cache_cannibalize_lock, c);
 +      return 0;
 +}
 +
 +static struct btree *btree_node_cannibalize(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      list_for_each_entry_reverse(b, &bc->live, list)
 +              if (!btree_node_reclaim(c, b))
 +                      return b;
 +
 +      while (1) {
 +              list_for_each_entry_reverse(b, &bc->live, list)
 +                      if (!btree_node_write_and_reclaim(c, b))
 +                              return b;
 +
 +              /*
 +               * Rare case: all nodes were intent-locked.
 +               * Just busy-wait.
 +               */
 +              WARN_ONCE(1, "btree cache cannibalize failed\n");
 +              cond_resched();
 +      }
 +}
 +
 +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct list_head *freed = pcpu_read_locks
 +              ? &bc->freed_pcpu
 +              : &bc->freed_nonpcpu;
 +      struct btree *b, *b2;
 +      u64 start_time = local_clock();
 +      unsigned flags;
 +
 +      flags = memalloc_nofs_save();
 +      mutex_lock(&bc->lock);
 +
 +      /*
 +       * We never free struct btree itself, just the memory that holds the on
 +       * disk node. Check the freed list before allocating a new one:
 +       */
 +      list_for_each_entry(b, freed, list)
 +              if (!btree_node_reclaim(c, b)) {
 +                      list_del_init(&b->list);
 +                      goto got_node;
 +              }
 +
 +      b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
 +      if (!b) {
 +              mutex_unlock(&bc->lock);
 +              bch2_trans_unlock(trans);
 +              b = __btree_node_mem_alloc(c, GFP_KERNEL);
 +              if (!b)
 +                      goto err;
 +              mutex_lock(&bc->lock);
 +      }
 +
 +      bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
 +
 +      BUG_ON(!six_trylock_intent(&b->c.lock));
 +      BUG_ON(!six_trylock_write(&b->c.lock));
 +got_node:
 +
 +      /*
 +       * btree_free() doesn't free memory; it sticks the node on the end of
 +       * the list. Check if there's any freed nodes there:
 +       */
 +      list_for_each_entry(b2, &bc->freeable, list)
 +              if (!btree_node_reclaim(c, b2)) {
 +                      swap(b->data, b2->data);
 +                      swap(b->aux_data, b2->aux_data);
 +                      btree_node_to_freedlist(bc, b2);
 +                      six_unlock_write(&b2->c.lock);
 +                      six_unlock_intent(&b2->c.lock);
 +                      goto got_mem;
 +              }
 +
 +      mutex_unlock(&bc->lock);
 +
 +      if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
 +              bch2_trans_unlock(trans);
 +              if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
 +                      goto err;
 +      }
 +
 +      mutex_lock(&bc->lock);
 +      bc->used++;
 +got_mem:
 +      mutex_unlock(&bc->lock);
 +
 +      BUG_ON(btree_node_hashed(b));
 +      BUG_ON(btree_node_dirty(b));
 +      BUG_ON(btree_node_write_in_flight(b));
 +out:
 +      b->flags                = 0;
 +      b->written              = 0;
 +      b->nsets                = 0;
 +      b->sib_u64s[0]          = 0;
 +      b->sib_u64s[1]          = 0;
 +      b->whiteout_u64s        = 0;
 +      bch2_btree_keys_init(b);
 +      set_btree_node_accessed(b);
 +
 +      bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
 +                             start_time);
 +
 +      memalloc_nofs_restore(flags);
 +      return b;
 +err:
 +      mutex_lock(&bc->lock);
 +
 +      /* Try to cannibalize another cached btree node: */
 +      if (bc->alloc_lock == current) {
 +              b2 = btree_node_cannibalize(c);
 +              clear_btree_node_just_written(b2);
 +              bch2_btree_node_hash_remove(bc, b2);
 +
 +              if (b) {
 +                      swap(b->data, b2->data);
 +                      swap(b->aux_data, b2->aux_data);
 +                      btree_node_to_freedlist(bc, b2);
 +                      six_unlock_write(&b2->c.lock);
 +                      six_unlock_intent(&b2->c.lock);
 +              } else {
 +                      b = b2;
 +                      list_del_init(&b->list);
 +              }
 +
 +              mutex_unlock(&bc->lock);
 +
 +              trace_and_count(c, btree_cache_cannibalize, c);
 +              goto out;
 +      }
 +
 +      mutex_unlock(&bc->lock);
 +      memalloc_nofs_restore(flags);
 +      return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
 +}
 +
 +/* Slowpath, don't want it inlined into btree_iter_traverse() */
 +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 +                              struct btree_path *path,
 +                              const struct bkey_i *k,
 +                              enum btree_id btree_id,
 +                              unsigned level,
 +                              enum six_lock_type lock_type,
 +                              bool sync)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      u32 seq;
 +
 +      BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
 +      /*
 +       * Parent node must be locked, else we could read in a btree node that's
 +       * been freed:
 +       */
 +      if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
 +              trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
 +      }
 +
 +      b = bch2_btree_node_mem_alloc(trans, level != 0);
 +
 +      if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
 +              trans->memory_allocation_failure = true;
 +              trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
 +      }
 +
 +      if (IS_ERR(b))
 +              return b;
 +
 +      /*
 +       * Btree nodes read in from disk should not have the accessed bit set
 +       * initially, so that linear scans don't thrash the cache:
 +       */
 +      clear_btree_node_accessed(b);
 +
 +      bkey_copy(&b->key, k);
 +      if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
 +              /* raced with another fill: */
 +
 +              /* mark as unhashed... */
 +              b->hash_val = 0;
 +
 +              mutex_lock(&bc->lock);
 +              list_add(&b->list, &bc->freeable);
 +              mutex_unlock(&bc->lock);
 +
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              return NULL;
 +      }
 +
 +      set_btree_node_read_in_flight(b);
 +
 +      six_unlock_write(&b->c.lock);
 +      seq = six_lock_seq(&b->c.lock);
 +      six_unlock_intent(&b->c.lock);
 +
 +      /* Unlock before doing IO: */
 +      if (path && sync)
 +              bch2_trans_unlock_noassert(trans);
 +
 +      bch2_btree_node_read(c, b, sync);
 +
 +      if (!sync)
 +              return NULL;
 +
 +      if (path) {
 +              int ret = bch2_trans_relock(trans) ?:
 +                      bch2_btree_path_relock_intent(trans, path);
 +              if (ret) {
 +                      BUG_ON(!trans->restarted);
 +                      return ERR_PTR(ret);
 +              }
 +      }
 +
 +      if (!six_relock_type(&b->c.lock, lock_type, seq)) {
 +              if (path)
 +                      trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
 +      }
 +
 +      return b;
 +}
 +
 +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
 +{
 +      struct printbuf buf = PRINTBUF;
 +
 +      if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
 +              return;
 +
 +      prt_printf(&buf,
 +             "btree node header doesn't match ptr\n"
 +             "btree %s level %u\n"
 +             "ptr: ",
 +             bch2_btree_ids[b->c.btree_id], b->c.level);
 +      bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 +
 +      prt_printf(&buf, "\nheader: btree %s level %llu\n"
 +             "min ",
 +             bch2_btree_ids[BTREE_NODE_ID(b->data)],
 +             BTREE_NODE_LEVEL(b->data));
 +      bch2_bpos_to_text(&buf, b->data->min_key);
 +
 +      prt_printf(&buf, "\nmax ");
 +      bch2_bpos_to_text(&buf, b->data->max_key);
 +
 +      bch2_fs_inconsistent(c, "%s", buf.buf);
 +      printbuf_exit(&buf);
 +}
 +
 +static inline void btree_check_header(struct bch_fs *c, struct btree *b)
 +{
 +      if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
 +          b->c.level != BTREE_NODE_LEVEL(b->data) ||
 +          !bpos_eq(b->data->max_key, b->key.k.p) ||
 +          (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
 +           !bpos_eq(b->data->min_key,
 +                    bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
 +              btree_bad_header(c, b);
 +}
 +
 +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
 +                                         const struct bkey_i *k, unsigned level,
 +                                         enum six_lock_type lock_type,
 +                                         unsigned long trace_ip)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      struct bset_tree *t;
 +      bool need_relock = false;
 +      int ret;
 +
 +      EBUG_ON(level >= BTREE_MAX_DEPTH);
 +retry:
 +      b = btree_cache_find(bc, k);
 +      if (unlikely(!b)) {
 +              /*
 +               * We must have the parent locked to call bch2_btree_node_fill(),
 +               * else we could read in a btree node from disk that's been
 +               * freed:
 +               */
 +              b = bch2_btree_node_fill(trans, path, k, path->btree_id,
 +                                       level, lock_type, true);
 +              need_relock = true;
 +
 +              /* We raced and found the btree node in the cache */
 +              if (!b)
 +                      goto retry;
 +
 +              if (IS_ERR(b))
 +                      return b;
 +      } else {
 +              if (btree_node_read_locked(path, level + 1))
 +                      btree_node_unlock(trans, path, level + 1);
 +
 +              ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      return ERR_PTR(ret);
 +
 +              BUG_ON(ret);
 +
 +              if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
 +                           b->c.level != level ||
 +                           race_fault())) {
 +                      six_unlock_type(&b->c.lock, lock_type);
 +                      if (bch2_btree_node_relock(trans, path, level + 1))
 +                              goto retry;
 +
 +                      trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
 +                      return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
 +              }
 +
 +              /* avoid atomic set bit if it's not needed: */
 +              if (!btree_node_accessed(b))
 +                      set_btree_node_accessed(b);
 +      }
 +
 +      if (unlikely(btree_node_read_in_flight(b))) {
 +              u32 seq = six_lock_seq(&b->c.lock);
 +
 +              six_unlock_type(&b->c.lock, lock_type);
 +              bch2_trans_unlock(trans);
 +              need_relock = true;
 +
 +              bch2_btree_node_wait_on_read(b);
 +
 +              /*
 +               * should_be_locked is not set on this path yet, so we need to
 +               * relock it specifically:
 +               */
 +              if (!six_relock_type(&b->c.lock, lock_type, seq))
 +                      goto retry;
 +      }
 +
 +      if (unlikely(need_relock)) {
 +              ret = bch2_trans_relock(trans) ?:
 +                      bch2_btree_path_relock_intent(trans, path);
 +              if (ret) {
 +                      six_unlock_type(&b->c.lock, lock_type);
 +                      return ERR_PTR(ret);
 +              }
 +      }
 +
 +      prefetch(b->aux_data);
 +
 +      for_each_bset(b, t) {
 +              void *p = (u64 *) b->aux_data + t->aux_data_offset;
 +
 +              prefetch(p + L1_CACHE_BYTES * 0);
 +              prefetch(p + L1_CACHE_BYTES * 1);
 +              prefetch(p + L1_CACHE_BYTES * 2);
 +      }
 +
 +      if (unlikely(btree_node_read_error(b))) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              return ERR_PTR(-EIO);
 +      }
 +
 +      EBUG_ON(b->c.btree_id != path->btree_id);
 +      EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
 +      btree_check_header(c, b);
 +
 +      return b;
 +}
 +
 +/**
 + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 + * in from disk if necessary.
 + *
 + * @trans:    btree transaction object
 + * @path:     btree_path being traversed
 + * @k:                pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 + * @level:    level of btree node being looked up (0 == leaf node)
 + * @lock_type:        SIX_LOCK_read or SIX_LOCK_intent
 + * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 + *
 + * The btree node will have either a read or a write lock held, depending on
 + * the @write parameter.
 + *
 + * Returns: btree node or ERR_PTR()
 + */
 +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
 +                                const struct bkey_i *k, unsigned level,
 +                                enum six_lock_type lock_type,
 +                                unsigned long trace_ip)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree *b;
 +      struct bset_tree *t;
 +      int ret;
 +
 +      EBUG_ON(level >= BTREE_MAX_DEPTH);
 +
 +      b = btree_node_mem_ptr(k);
 +
 +      /*
 +       * Check b->hash_val _before_ calling btree_node_lock() - this might not
 +       * be the node we want anymore, and trying to lock the wrong node could
 +       * cause an unneccessary transaction restart:
 +       */
 +      if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
 +                   !b ||
 +                   b->hash_val != btree_ptr_hash_val(k)))
 +              return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 +
 +      if (btree_node_read_locked(path, level + 1))
 +              btree_node_unlock(trans, path, level + 1);
 +
 +      ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              return ERR_PTR(ret);
 +
 +      BUG_ON(ret);
 +
 +      if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
 +                   b->c.level != level ||
 +                   race_fault())) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              if (bch2_btree_node_relock(trans, path, level + 1))
 +                      return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 +
 +              trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
 +      }
 +
 +      if (unlikely(btree_node_read_in_flight(b))) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 +      }
 +
 +      prefetch(b->aux_data);
 +
 +      for_each_bset(b, t) {
 +              void *p = (u64 *) b->aux_data + t->aux_data_offset;
 +
 +              prefetch(p + L1_CACHE_BYTES * 0);
 +              prefetch(p + L1_CACHE_BYTES * 1);
 +              prefetch(p + L1_CACHE_BYTES * 2);
 +      }
 +
 +      /* avoid atomic set bit if it's not needed: */
 +      if (!btree_node_accessed(b))
 +              set_btree_node_accessed(b);
 +
 +      if (unlikely(btree_node_read_error(b))) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              return ERR_PTR(-EIO);
 +      }
 +
 +      EBUG_ON(b->c.btree_id != path->btree_id);
 +      EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
 +      btree_check_header(c, b);
 +
 +      return b;
 +}
 +
 +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
 +                                       const struct bkey_i *k,
 +                                       enum btree_id btree_id,
 +                                       unsigned level,
 +                                       bool nofill)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      struct bset_tree *t;
 +      int ret;
 +
 +      EBUG_ON(level >= BTREE_MAX_DEPTH);
 +
 +      if (c->opts.btree_node_mem_ptr_optimization) {
 +              b = btree_node_mem_ptr(k);
 +              if (b)
 +                      goto lock_node;
 +      }
 +retry:
 +      b = btree_cache_find(bc, k);
 +      if (unlikely(!b)) {
 +              if (nofill)
 +                      goto out;
 +
 +              b = bch2_btree_node_fill(trans, NULL, k, btree_id,
 +                                       level, SIX_LOCK_read, true);
 +
 +              /* We raced and found the btree node in the cache */
 +              if (!b)
 +                      goto retry;
 +
 +              if (IS_ERR(b) &&
 +                  !bch2_btree_cache_cannibalize_lock(c, NULL))
 +                      goto retry;
 +
 +              if (IS_ERR(b))
 +                      goto out;
 +      } else {
 +lock_node:
 +              ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      return ERR_PTR(ret);
 +
 +              BUG_ON(ret);
 +
 +              if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
 +                           b->c.btree_id != btree_id ||
 +                           b->c.level != level)) {
 +                      six_unlock_read(&b->c.lock);
 +                      goto retry;
 +              }
 +      }
 +
 +      /* XXX: waiting on IO with btree locks held: */
 +      __bch2_btree_node_wait_on_read(b);
 +
 +      prefetch(b->aux_data);
 +
 +      for_each_bset(b, t) {
 +              void *p = (u64 *) b->aux_data + t->aux_data_offset;
 +
 +              prefetch(p + L1_CACHE_BYTES * 0);
 +              prefetch(p + L1_CACHE_BYTES * 1);
 +              prefetch(p + L1_CACHE_BYTES * 2);
 +      }
 +
 +      /* avoid atomic set bit if it's not needed: */
 +      if (!btree_node_accessed(b))
 +              set_btree_node_accessed(b);
 +
 +      if (unlikely(btree_node_read_error(b))) {
 +              six_unlock_read(&b->c.lock);
 +              b = ERR_PTR(-EIO);
 +              goto out;
 +      }
 +
 +      EBUG_ON(b->c.btree_id != btree_id);
 +      EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
 +      btree_check_header(c, b);
 +out:
 +      bch2_btree_cache_cannibalize_unlock(c);
 +      return b;
 +}
 +
 +int bch2_btree_node_prefetch(struct btree_trans *trans,
 +                           struct btree_path *path,
 +                           const struct bkey_i *k,
 +                           enum btree_id btree_id, unsigned level)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      BUG_ON(trans && !btree_node_locked(path, level + 1));
 +      BUG_ON(level >= BTREE_MAX_DEPTH);
 +
 +      b = btree_cache_find(bc, k);
 +      if (b)
 +              return 0;
 +
 +      b = bch2_btree_node_fill(trans, path, k, btree_id,
 +                               level, SIX_LOCK_read, false);
 +      return PTR_ERR_OR_ZERO(b);
 +}
 +
 +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      b = btree_cache_find(bc, k);
 +      if (!b)
 +              return;
 +wait_on_io:
 +      /* not allowed to wait on io with btree locks held: */
 +
 +      /* XXX we're called from btree_gc which will be holding other btree
 +       * nodes locked
 +       */
 +      __bch2_btree_node_wait_on_read(b);
 +      __bch2_btree_node_wait_on_write(b);
 +
 +      btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
 +      btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
 +
 +      if (btree_node_dirty(b)) {
 +              __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              goto wait_on_io;
 +      }
 +
 +      BUG_ON(btree_node_dirty(b));
 +
 +      mutex_lock(&bc->lock);
 +      btree_node_data_free(c, b);
 +      bch2_btree_node_hash_remove(bc, b);
 +      mutex_unlock(&bc->lock);
 +
 +      six_unlock_write(&b->c.lock);
 +      six_unlock_intent(&b->c.lock);
 +}
 +
 +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 +                           const struct btree *b)
 +{
 +      struct bset_stats stats;
 +
 +      memset(&stats, 0, sizeof(stats));
 +
 +      bch2_btree_keys_stats(b, &stats);
 +
 +      prt_printf(out, "l %u ", b->c.level);
 +      bch2_bpos_to_text(out, b->data->min_key);
 +      prt_printf(out, " - ");
 +      bch2_bpos_to_text(out, b->data->max_key);
 +      prt_printf(out, ":\n"
 +             "    ptrs: ");
 +      bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
 +      prt_newline(out);
 +
 +      prt_printf(out,
 +             "    format: ");
 +      bch2_bkey_format_to_text(out, &b->format);
 +
 +      prt_printf(out,
 +             "    unpack fn len: %u\n"
 +             "    bytes used %zu/%zu (%zu%% full)\n"
 +             "    sib u64s: %u, %u (merge threshold %u)\n"
 +             "    nr packed keys %u\n"
 +             "    nr unpacked keys %u\n"
 +             "    floats %zu\n"
 +             "    failed unpacked %zu\n",
 +             b->unpack_fn_len,
 +             b->nr.live_u64s * sizeof(u64),
 +             btree_bytes(c) - sizeof(struct btree_node),
 +             b->nr.live_u64s * 100 / btree_max_u64s(c),
 +             b->sib_u64s[0],
 +             b->sib_u64s[1],
 +             c->btree_foreground_merge_threshold,
 +             b->nr.packed_keys,
 +             b->nr.unpacked_keys,
 +             stats.floats,
 +             stats.failed);
 +}
 +
 +void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
 +{
 +      prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
 +      prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
 +      prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
 +}
index 29a0b566a4fe9ebdfd07666249c0eed23d427bea,0000000000000000000000000000000000000000..f9a5e38a085bbfb280fbe439ca2a6b1f0ba2f1af
mode 100644,000000..100644
--- /dev/null
@@@ -1,1072 -1,0 +1,1075 @@@
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_key_cache.shrink);
 +// SPDX-License-Identifier: GPL-2.0
 +
 +#include "bcachefs.h"
 +#include "btree_cache.h"
 +#include "btree_iter.h"
 +#include "btree_key_cache.h"
 +#include "btree_locking.h"
 +#include "btree_update.h"
 +#include "errcode.h"
 +#include "error.h"
 +#include "journal.h"
 +#include "journal_reclaim.h"
 +#include "trace.h"
 +
 +#include <linux/sched/mm.h>
 +
 +static inline bool btree_uses_pcpu_readers(enum btree_id id)
 +{
 +      return id == BTREE_ID_subvolumes;
 +}
 +
 +static struct kmem_cache *bch2_key_cache;
 +
 +static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
 +                                     const void *obj)
 +{
 +      const struct bkey_cached *ck = obj;
 +      const struct bkey_cached_key *key = arg->key;
 +
 +      return ck->key.btree_id != key->btree_id ||
 +              !bpos_eq(ck->key.pos, key->pos);
 +}
 +
 +static const struct rhashtable_params bch2_btree_key_cache_params = {
 +      .head_offset    = offsetof(struct bkey_cached, hash),
 +      .key_offset     = offsetof(struct bkey_cached, key),
 +      .key_len        = sizeof(struct bkey_cached_key),
 +      .obj_cmpfn      = bch2_btree_key_cache_cmp_fn,
 +};
 +
 +__flatten
 +inline struct bkey_cached *
 +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
 +{
 +      struct bkey_cached_key key = {
 +              .btree_id       = btree_id,
 +              .pos            = pos,
 +      };
 +
 +      return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
 +                                    bch2_btree_key_cache_params);
 +}
 +
 +static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
 +{
 +      if (!six_trylock_intent(&ck->c.lock))
 +              return false;
 +
 +      if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              six_unlock_intent(&ck->c.lock);
 +              return false;
 +      }
 +
 +      if (!six_trylock_write(&ck->c.lock)) {
 +              six_unlock_intent(&ck->c.lock);
 +              return false;
 +      }
 +
 +      return true;
 +}
 +
 +static void bkey_cached_evict(struct btree_key_cache *c,
 +                            struct bkey_cached *ck)
 +{
 +      BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
 +                                    bch2_btree_key_cache_params));
 +      memset(&ck->key, ~0, sizeof(ck->key));
 +
 +      atomic_long_dec(&c->nr_keys);
 +}
 +
 +static void bkey_cached_free(struct btree_key_cache *bc,
 +                           struct bkey_cached *ck)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 +
 +      BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 +
 +      ck->btree_trans_barrier_seq =
 +              start_poll_synchronize_srcu(&c->btree_trans_barrier);
 +
 +      if (ck->c.lock.readers)
 +              list_move_tail(&ck->list, &bc->freed_pcpu);
 +      else
 +              list_move_tail(&ck->list, &bc->freed_nonpcpu);
 +      atomic_long_inc(&bc->nr_freed);
 +
 +      kfree(ck->k);
 +      ck->k           = NULL;
 +      ck->u64s        = 0;
 +
 +      six_unlock_write(&ck->c.lock);
 +      six_unlock_intent(&ck->c.lock);
 +}
 +
 +#ifdef __KERNEL__
 +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 +                                                 struct bkey_cached *ck)
 +{
 +      struct bkey_cached *pos;
 +
 +      list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
 +              if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
 +                               pos->btree_trans_barrier_seq)) {
 +                      list_move(&ck->list, &pos->list);
 +                      return;
 +              }
 +      }
 +
 +      list_move(&ck->list, &bc->freed_nonpcpu);
 +}
 +#endif
 +
 +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 +                                       struct bkey_cached *ck)
 +{
 +      BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 +
 +      if (!ck->c.lock.readers) {
 +#ifdef __KERNEL__
 +              struct btree_key_cache_freelist *f;
 +              bool freed = false;
 +
 +              preempt_disable();
 +              f = this_cpu_ptr(bc->pcpu_freed);
 +
 +              if (f->nr < ARRAY_SIZE(f->objs)) {
 +                      f->objs[f->nr++] = ck;
 +                      freed = true;
 +              }
 +              preempt_enable();
 +
 +              if (!freed) {
 +                      mutex_lock(&bc->lock);
 +                      preempt_disable();
 +                      f = this_cpu_ptr(bc->pcpu_freed);
 +
 +                      while (f->nr > ARRAY_SIZE(f->objs) / 2) {
 +                              struct bkey_cached *ck2 = f->objs[--f->nr];
 +
 +                              __bkey_cached_move_to_freelist_ordered(bc, ck2);
 +                      }
 +                      preempt_enable();
 +
 +                      __bkey_cached_move_to_freelist_ordered(bc, ck);
 +                      mutex_unlock(&bc->lock);
 +              }
 +#else
 +              mutex_lock(&bc->lock);
 +              list_move_tail(&ck->list, &bc->freed_nonpcpu);
 +              mutex_unlock(&bc->lock);
 +#endif
 +      } else {
 +              mutex_lock(&bc->lock);
 +              list_move_tail(&ck->list, &bc->freed_pcpu);
 +              mutex_unlock(&bc->lock);
 +      }
 +}
 +
 +static void bkey_cached_free_fast(struct btree_key_cache *bc,
 +                                struct bkey_cached *ck)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 +
 +      ck->btree_trans_barrier_seq =
 +              start_poll_synchronize_srcu(&c->btree_trans_barrier);
 +
 +      list_del_init(&ck->list);
 +      atomic_long_inc(&bc->nr_freed);
 +
 +      kfree(ck->k);
 +      ck->k           = NULL;
 +      ck->u64s        = 0;
 +
 +      bkey_cached_move_to_freelist(bc, ck);
 +
 +      six_unlock_write(&ck->c.lock);
 +      six_unlock_intent(&ck->c.lock);
 +}
 +
 +static struct bkey_cached *
 +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 +                bool *was_new)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      struct bkey_cached *ck = NULL;
 +      bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
 +      int ret;
 +
 +      if (!pcpu_readers) {
 +#ifdef __KERNEL__
 +              struct btree_key_cache_freelist *f;
 +
 +              preempt_disable();
 +              f = this_cpu_ptr(bc->pcpu_freed);
 +              if (f->nr)
 +                      ck = f->objs[--f->nr];
 +              preempt_enable();
 +
 +              if (!ck) {
 +                      mutex_lock(&bc->lock);
 +                      preempt_disable();
 +                      f = this_cpu_ptr(bc->pcpu_freed);
 +
 +                      while (!list_empty(&bc->freed_nonpcpu) &&
 +                             f->nr < ARRAY_SIZE(f->objs) / 2) {
 +                              ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 +                              list_del_init(&ck->list);
 +                              f->objs[f->nr++] = ck;
 +                      }
 +
 +                      ck = f->nr ? f->objs[--f->nr] : NULL;
 +                      preempt_enable();
 +                      mutex_unlock(&bc->lock);
 +              }
 +#else
 +              mutex_lock(&bc->lock);
 +              if (!list_empty(&bc->freed_nonpcpu)) {
 +                      ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 +                      list_del_init(&ck->list);
 +              }
 +              mutex_unlock(&bc->lock);
 +#endif
 +      } else {
 +              mutex_lock(&bc->lock);
 +              if (!list_empty(&bc->freed_pcpu)) {
 +                      ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
 +                      list_del_init(&ck->list);
 +              }
 +              mutex_unlock(&bc->lock);
 +      }
 +
 +      if (ck) {
 +              ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
 +              if (unlikely(ret)) {
 +                      bkey_cached_move_to_freelist(bc, ck);
 +                      return ERR_PTR(ret);
 +              }
 +
 +              path->l[0].b = (void *) ck;
 +              path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 +
 +              ret = bch2_btree_node_lock_write(trans, path, &ck->c);
 +              if (unlikely(ret)) {
 +                      btree_node_unlock(trans, path, 0);
 +                      bkey_cached_move_to_freelist(bc, ck);
 +                      return ERR_PTR(ret);
 +              }
 +
 +              return ck;
 +      }
 +
 +      ck = allocate_dropping_locks(trans, ret,
 +                      kmem_cache_zalloc(bch2_key_cache, _gfp));
 +      if (ret) {
 +              kmem_cache_free(bch2_key_cache, ck);
 +              return ERR_PTR(ret);
 +      }
 +
 +      if (!ck)
 +              return NULL;
 +
 +      INIT_LIST_HEAD(&ck->list);
 +      bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
 +
 +      ck->c.cached = true;
 +      BUG_ON(!six_trylock_intent(&ck->c.lock));
 +      BUG_ON(!six_trylock_write(&ck->c.lock));
 +      *was_new = true;
 +      return ck;
 +}
 +
 +static struct bkey_cached *
 +bkey_cached_reuse(struct btree_key_cache *c)
 +{
 +      struct bucket_table *tbl;
 +      struct rhash_head *pos;
 +      struct bkey_cached *ck;
 +      unsigned i;
 +
 +      mutex_lock(&c->lock);
 +      rcu_read_lock();
 +      tbl = rht_dereference_rcu(c->table.tbl, &c->table);
 +      for (i = 0; i < tbl->size; i++)
 +              rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 +                      if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 +                          bkey_cached_lock_for_evict(ck)) {
 +                              bkey_cached_evict(c, ck);
 +                              goto out;
 +                      }
 +              }
 +      ck = NULL;
 +out:
 +      rcu_read_unlock();
 +      mutex_unlock(&c->lock);
 +      return ck;
 +}
 +
 +static struct bkey_cached *
 +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      struct bkey_cached *ck;
 +      bool was_new = false;
 +
 +      ck = bkey_cached_alloc(trans, path, &was_new);
 +      if (IS_ERR(ck))
 +              return ck;
 +
 +      if (unlikely(!ck)) {
 +              ck = bkey_cached_reuse(bc);
 +              if (unlikely(!ck)) {
 +                      bch_err(c, "error allocating memory for key cache item, btree %s",
 +                              bch2_btree_ids[path->btree_id]);
 +                      return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 +      }
 +
 +      ck->c.level             = 0;
 +      ck->c.btree_id          = path->btree_id;
 +      ck->key.btree_id        = path->btree_id;
 +      ck->key.pos             = path->pos;
 +      ck->valid               = false;
 +      ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 +
 +      if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
 +                                        &ck->hash,
 +                                        bch2_btree_key_cache_params))) {
 +              /* We raced with another fill: */
 +
 +              if (likely(was_new)) {
 +                      six_unlock_write(&ck->c.lock);
 +                      six_unlock_intent(&ck->c.lock);
 +                      kfree(ck);
 +              } else {
 +                      bkey_cached_free_fast(bc, ck);
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
 +              return NULL;
 +      }
 +
 +      atomic_long_inc(&bc->nr_keys);
 +
 +      six_unlock_write(&ck->c.lock);
 +
 +      return ck;
 +}
 +
 +static int btree_key_cache_fill(struct btree_trans *trans,
 +                              struct btree_path *ck_path,
 +                              struct bkey_cached *ck)
 +{
 +      struct btree_iter iter;
 +      struct bkey_s_c k;
 +      unsigned new_u64s = 0;
 +      struct bkey_i *new_k = NULL;
 +      int ret;
 +
 +      k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
 +                             BTREE_ITER_KEY_CACHE_FILL|
 +                             BTREE_ITER_CACHED_NOFILL);
 +      ret = bkey_err(k);
 +      if (ret)
 +              goto err;
 +
 +      if (!bch2_btree_node_relock(trans, ck_path, 0)) {
 +              trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
 +              ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
 +              goto err;
 +      }
 +
 +      /*
 +       * bch2_varint_decode can read past the end of the buffer by at
 +       * most 7 bytes (it won't be used):
 +       */
 +      new_u64s = k.k->u64s + 1;
 +
 +      /*
 +       * Allocate some extra space so that the transaction commit path is less
 +       * likely to have to reallocate, since that requires a transaction
 +       * restart:
 +       */
 +      new_u64s = min(256U, (new_u64s * 3) / 2);
 +
 +      if (new_u64s > ck->u64s) {
 +              new_u64s = roundup_pow_of_two(new_u64s);
 +              new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
 +              if (!new_k) {
 +                      bch2_trans_unlock(trans);
 +
 +                      new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
 +                      if (!new_k) {
 +                              bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
 +                                      bch2_btree_ids[ck->key.btree_id], new_u64s);
 +                              ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
 +                              goto err;
 +                      }
 +
 +                      if (!bch2_btree_node_relock(trans, ck_path, 0)) {
 +                              kfree(new_k);
 +                              trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
 +                              ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
 +                              goto err;
 +                      }
 +
 +                      ret = bch2_trans_relock(trans);
 +                      if (ret) {
 +                              kfree(new_k);
 +                              goto err;
 +                      }
 +              }
 +      }
 +
 +      ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
 +      if (ret) {
 +              kfree(new_k);
 +              goto err;
 +      }
 +
 +      if (new_k) {
 +              kfree(ck->k);
 +              ck->u64s = new_u64s;
 +              ck->k = new_k;
 +      }
 +
 +      bkey_reassemble(ck->k, k);
 +      ck->valid = true;
 +      bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 +
 +      /* We're not likely to need this iterator again: */
 +      set_btree_iter_dontneed(&iter);
 +err:
 +      bch2_trans_iter_exit(trans, &iter);
 +      return ret;
 +}
 +
 +static noinline int
 +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
 +                                       unsigned flags)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck;
 +      int ret = 0;
 +
 +      BUG_ON(path->level);
 +
 +      path->l[1].b = NULL;
 +
 +      if (bch2_btree_node_relock_notrace(trans, path, 0)) {
 +              ck = (void *) path->l[0].b;
 +              goto fill;
 +      }
 +retry:
 +      ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
 +      if (!ck) {
 +              ck = btree_key_cache_create(trans, path);
 +              ret = PTR_ERR_OR_ZERO(ck);
 +              if (ret)
 +                      goto err;
 +              if (!ck)
 +                      goto retry;
 +
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 +              path->locks_want = 1;
 +      } else {
 +              enum six_lock_type lock_want = __btree_lock_want(path, 0);
 +
 +              ret = btree_node_lock(trans, path, (void *) ck, 0,
 +                                    lock_want, _THIS_IP_);
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      goto err;
 +
 +              BUG_ON(ret);
 +
 +              if (ck->key.btree_id != path->btree_id ||
 +                  !bpos_eq(ck->key.pos, path->pos)) {
 +                      six_unlock_type(&ck->c.lock, lock_want);
 +                      goto retry;
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0,
 +                                     (enum btree_node_locked_type) lock_want);
 +      }
 +
 +      path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
 +      path->l[0].b            = (void *) ck;
 +fill:
 +      path->uptodate = BTREE_ITER_UPTODATE;
 +
 +      if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
 +              /*
 +               * Using the underscore version because we haven't set
 +               * path->uptodate yet:
 +               */
 +              if (!path->locks_want &&
 +                  !__bch2_btree_path_upgrade(trans, path, 1)) {
 +                      trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
 +                      ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
 +                      goto err;
 +              }
 +
 +              ret = btree_key_cache_fill(trans, path, ck);
 +              if (ret)
 +                      goto err;
 +
 +              ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
 +              if (ret)
 +                      goto err;
 +
 +              path->uptodate = BTREE_ITER_UPTODATE;
 +      }
 +
 +      if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 +              set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 +
 +      BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 +      BUG_ON(path->uptodate);
 +
 +      return ret;
 +err:
 +      path->uptodate = BTREE_ITER_NEED_TRAVERSE;
 +      if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
 +              btree_node_unlock(trans, path, 0);
 +              path->l[0].b = ERR_PTR(ret);
 +      }
 +      return ret;
 +}
 +
 +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
 +                                  unsigned flags)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck;
 +      int ret = 0;
 +
 +      EBUG_ON(path->level);
 +
 +      path->l[1].b = NULL;
 +
 +      if (bch2_btree_node_relock_notrace(trans, path, 0)) {
 +              ck = (void *) path->l[0].b;
 +              goto fill;
 +      }
 +retry:
 +      ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
 +      if (!ck) {
 +              return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
 +      } else {
 +              enum six_lock_type lock_want = __btree_lock_want(path, 0);
 +
 +              ret = btree_node_lock(trans, path, (void *) ck, 0,
 +                                    lock_want, _THIS_IP_);
 +              EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
 +
 +              if (ret)
 +                      return ret;
 +
 +              if (ck->key.btree_id != path->btree_id ||
 +                  !bpos_eq(ck->key.pos, path->pos)) {
 +                      six_unlock_type(&ck->c.lock, lock_want);
 +                      goto retry;
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0,
 +                                     (enum btree_node_locked_type) lock_want);
 +      }
 +
 +      path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
 +      path->l[0].b            = (void *) ck;
 +fill:
 +      if (!ck->valid)
 +              return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
 +
 +      if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 +              set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 +
 +      path->uptodate = BTREE_ITER_UPTODATE;
 +      EBUG_ON(!ck->valid);
 +      EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 +
 +      return ret;
 +}
 +
 +static int btree_key_cache_flush_pos(struct btree_trans *trans,
 +                                   struct bkey_cached_key key,
 +                                   u64 journal_seq,
 +                                   unsigned commit_flags,
 +                                   bool evict)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct journal *j = &c->journal;
 +      struct btree_iter c_iter, b_iter;
 +      struct bkey_cached *ck = NULL;
 +      int ret;
 +
 +      bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
 +                           BTREE_ITER_SLOTS|
 +                           BTREE_ITER_INTENT|
 +                           BTREE_ITER_ALL_SNAPSHOTS);
 +      bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
 +                           BTREE_ITER_CACHED|
 +                           BTREE_ITER_INTENT);
 +      b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
 +
 +      ret = bch2_btree_iter_traverse(&c_iter);
 +      if (ret)
 +              goto out;
 +
 +      ck = (void *) c_iter.path->l[0].b;
 +      if (!ck)
 +              goto out;
 +
 +      if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              if (evict)
 +                      goto evict;
 +              goto out;
 +      }
 +
 +      BUG_ON(!ck->valid);
 +
 +      if (journal_seq && ck->journal.seq != journal_seq)
 +              goto out;
 +
 +      /*
 +       * Since journal reclaim depends on us making progress here, and the
 +       * allocator/copygc depend on journal reclaim making progress, we need
 +       * to be using alloc reserves:
 +       */
 +      ret   = bch2_btree_iter_traverse(&b_iter) ?:
 +              bch2_trans_update(trans, &b_iter, ck->k,
 +                                BTREE_UPDATE_KEY_CACHE_RECLAIM|
 +                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
 +                                BTREE_TRIGGER_NORUN) ?:
 +              bch2_trans_commit(trans, NULL, NULL,
 +                                BTREE_INSERT_NOCHECK_RW|
 +                                BTREE_INSERT_NOFAIL|
 +                                (ck->journal.seq == journal_last_seq(j)
 +                                 ? BCH_WATERMARK_reclaim
 +                                 : 0)|
 +                                commit_flags);
 +
 +      bch2_fs_fatal_err_on(ret &&
 +                           !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 +                           !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
 +                           !bch2_journal_error(j), c,
 +                           "error flushing key cache: %s", bch2_err_str(ret));
 +      if (ret)
 +              goto out;
 +
 +      bch2_journal_pin_drop(j, &ck->journal);
 +      bch2_journal_preres_put(j, &ck->res);
 +
 +      BUG_ON(!btree_node_locked(c_iter.path, 0));
 +
 +      if (!evict) {
 +              if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +                      clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +                      atomic_long_dec(&c->btree_key_cache.nr_dirty);
 +              }
 +      } else {
 +              struct btree_path *path2;
 +evict:
 +              trans_for_each_path(trans, path2)
 +                      if (path2 != c_iter.path)
 +                              __bch2_btree_path_unlock(trans, path2);
 +
 +              bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
 +
 +              if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +                      clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +                      atomic_long_dec(&c->btree_key_cache.nr_dirty);
 +              }
 +
 +              mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
 +              bkey_cached_evict(&c->btree_key_cache, ck);
 +              bkey_cached_free_fast(&c->btree_key_cache, ck);
 +      }
 +out:
 +      bch2_trans_iter_exit(trans, &b_iter);
 +      bch2_trans_iter_exit(trans, &c_iter);
 +      return ret;
 +}
 +
 +int bch2_btree_key_cache_journal_flush(struct journal *j,
 +                              struct journal_entry_pin *pin, u64 seq)
 +{
 +      struct bch_fs *c = container_of(j, struct bch_fs, journal);
 +      struct bkey_cached *ck =
 +              container_of(pin, struct bkey_cached, journal);
 +      struct bkey_cached_key key;
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 +      int ret = 0;
 +
 +      btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
 +      key = ck->key;
 +
 +      if (ck->journal.seq != seq ||
 +          !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              six_unlock_read(&ck->c.lock);
 +              goto unlock;
 +      }
 +
 +      if (ck->seq != seq) {
 +              bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
 +                                      bch2_btree_key_cache_journal_flush);
 +              six_unlock_read(&ck->c.lock);
 +              goto unlock;
 +      }
 +      six_unlock_read(&ck->c.lock);
 +
 +      ret = commit_do(trans, NULL, NULL, 0,
 +              btree_key_cache_flush_pos(trans, key, seq,
 +                              BTREE_INSERT_JOURNAL_RECLAIM, false));
 +unlock:
 +      srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 +
 +      bch2_trans_put(trans);
 +      return ret;
 +}
 +
 +/*
 + * Flush and evict a key from the key cache:
 + */
 +int bch2_btree_key_cache_flush(struct btree_trans *trans,
 +                             enum btree_id id, struct bpos pos)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached_key key = { id, pos };
 +
 +      /* Fastpath - assume it won't be found: */
 +      if (!bch2_btree_key_cache_find(c, id, pos))
 +              return 0;
 +
 +      return btree_key_cache_flush_pos(trans, key, 0, 0, true);
 +}
 +
 +bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 +                                unsigned flags,
 +                                struct btree_insert_entry *insert_entry)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
 +      struct bkey_i *insert = insert_entry->k;
 +      bool kick_reclaim = false;
 +
 +      BUG_ON(insert->k.u64s > ck->u64s);
 +
 +      if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
 +              int difference;
 +
 +              BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
 +
 +              difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
 +              if (difference > 0) {
 +                      trans->journal_preres.u64s      -= difference;
 +                      ck->res.u64s                    += difference;
 +              }
 +      }
 +
 +      bkey_copy(ck->k, insert);
 +      ck->valid = true;
 +
 +      if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 +              set_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +              atomic_long_inc(&c->btree_key_cache.nr_dirty);
 +
 +              if (bch2_nr_btree_keys_need_flush(c))
 +                      kick_reclaim = true;
 +      }
 +
 +      /*
 +       * To minimize lock contention, we only add the journal pin here and
 +       * defer pin updates to the flush callback via ->seq. Be careful not to
 +       * update ->seq on nojournal commits because we don't want to update the
 +       * pin to a seq that doesn't include journal updates on disk. Otherwise
 +       * we risk losing the update after a crash.
 +       *
 +       * The only exception is if the pin is not active in the first place. We
 +       * have to add the pin because journal reclaim drives key cache
 +       * flushing. The flush callback will not proceed unless ->seq matches
 +       * the latest pin, so make sure it starts with a consistent value.
 +       */
 +      if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
 +          !journal_pin_active(&ck->journal)) {
 +              ck->seq = trans->journal_res.seq;
 +      }
 +      bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
 +                           &ck->journal, bch2_btree_key_cache_journal_flush);
 +
 +      if (kick_reclaim)
 +              journal_reclaim_kick(&c->journal);
 +      return true;
 +}
 +
 +void bch2_btree_key_cache_drop(struct btree_trans *trans,
 +                             struct btree_path *path)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck = (void *) path->l[0].b;
 +
 +      BUG_ON(!ck->valid);
 +
 +      /*
 +       * We just did an update to the btree, bypassing the key cache: the key
 +       * cache key is now stale and must be dropped, even if dirty:
 +       */
 +      if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +              atomic_long_dec(&c->btree_key_cache.nr_dirty);
 +              bch2_journal_pin_drop(&c->journal, &ck->journal);
 +      }
 +
 +      ck->valid = false;
 +}
 +
 +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 +                                         struct shrink_control *sc)
 +{
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_key_cache.shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      struct bucket_table *tbl;
 +      struct bkey_cached *ck, *t;
 +      size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
 +      unsigned start, flags;
 +      int srcu_idx;
 +
 +      mutex_lock(&bc->lock);
 +      srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 +      flags = memalloc_nofs_save();
 +
 +      /*
 +       * Newest freed entries are at the end of the list - once we hit one
 +       * that's too new to be freed, we can bail out:
 +       */
 +      list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
 +              if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 +                                               ck->btree_trans_barrier_seq))
 +                      break;
 +
 +              list_del(&ck->list);
 +              six_lock_exit(&ck->c.lock);
 +              kmem_cache_free(bch2_key_cache, ck);
 +              atomic_long_dec(&bc->nr_freed);
 +              scanned++;
 +              freed++;
 +      }
 +
 +      if (scanned >= nr)
 +              goto out;
 +
 +      list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
 +              if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 +                                               ck->btree_trans_barrier_seq))
 +                      break;
 +
 +              list_del(&ck->list);
 +              six_lock_exit(&ck->c.lock);
 +              kmem_cache_free(bch2_key_cache, ck);
 +              atomic_long_dec(&bc->nr_freed);
 +              scanned++;
 +              freed++;
 +      }
 +
 +      if (scanned >= nr)
 +              goto out;
 +
 +      rcu_read_lock();
 +      tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 +      if (bc->shrink_iter >= tbl->size)
 +              bc->shrink_iter = 0;
 +      start = bc->shrink_iter;
 +
 +      do {
 +              struct rhash_head *pos, *next;
 +
 +              pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
 +
 +              while (!rht_is_a_nulls(pos)) {
 +                      next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
 +                      ck = container_of(pos, struct bkey_cached, hash);
 +
 +                      if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
 +                              goto next;
 +
 +                      if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 +                              clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 +                      else if (bkey_cached_lock_for_evict(ck)) {
 +                              bkey_cached_evict(bc, ck);
 +                              bkey_cached_free(bc, ck);
 +                      }
 +
 +                      scanned++;
 +                      if (scanned >= nr)
 +                              break;
 +next:
 +                      pos = next;
 +              }
 +
 +              bc->shrink_iter++;
 +              if (bc->shrink_iter >= tbl->size)
 +                      bc->shrink_iter = 0;
 +      } while (scanned < nr && bc->shrink_iter != start);
 +
 +      rcu_read_unlock();
 +out:
 +      memalloc_nofs_restore(flags);
 +      srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 +      mutex_unlock(&bc->lock);
 +
 +      return freed;
 +}
 +
 +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
 +                                          struct shrink_control *sc)
 +{
-       unregister_shrinker(&bc->shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      long nr = atomic_long_read(&bc->nr_keys) -
 +              atomic_long_read(&bc->nr_dirty);
 +
 +      return max(0L, nr);
 +}
 +
 +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 +      struct bucket_table *tbl;
 +      struct bkey_cached *ck, *n;
 +      struct rhash_head *pos;
 +      LIST_HEAD(items);
 +      unsigned i;
 +#ifdef __KERNEL__
 +      int cpu;
 +#endif
 +
-       bc->shrink.seeks                = 0;
-       bc->shrink.count_objects        = bch2_btree_key_cache_count;
-       bc->shrink.scan_objects         = bch2_btree_key_cache_scan;
-       if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
++      shrinker_free(bc->shrink);
 +
 +      mutex_lock(&bc->lock);
 +
 +      /*
 +       * The loop is needed to guard against racing with rehash:
 +       */
 +      while (atomic_long_read(&bc->nr_keys)) {
 +              rcu_read_lock();
 +              tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 +              if (tbl)
 +                      for (i = 0; i < tbl->size; i++)
 +                              rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 +                                      bkey_cached_evict(bc, ck);
 +                                      list_add(&ck->list, &items);
 +                              }
 +              rcu_read_unlock();
 +      }
 +
 +#ifdef __KERNEL__
 +      for_each_possible_cpu(cpu) {
 +              struct btree_key_cache_freelist *f =
 +                      per_cpu_ptr(bc->pcpu_freed, cpu);
 +
 +              for (i = 0; i < f->nr; i++) {
 +                      ck = f->objs[i];
 +                      list_add(&ck->list, &items);
 +              }
 +      }
 +#endif
 +
 +      list_splice(&bc->freed_pcpu,    &items);
 +      list_splice(&bc->freed_nonpcpu, &items);
 +
 +      mutex_unlock(&bc->lock);
 +
 +      list_for_each_entry_safe(ck, n, &items, list) {
 +              cond_resched();
 +
 +              bch2_journal_pin_drop(&c->journal, &ck->journal);
 +              bch2_journal_preres_put(&c->journal, &ck->res);
 +
 +              list_del(&ck->list);
 +              kfree(ck->k);
 +              six_lock_exit(&ck->c.lock);
 +              kmem_cache_free(bch2_key_cache, ck);
 +      }
 +
 +      if (atomic_long_read(&bc->nr_dirty) &&
 +          !bch2_journal_error(&c->journal) &&
 +          test_bit(BCH_FS_WAS_RW, &c->flags))
 +              panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
 +                    atomic_long_read(&bc->nr_dirty));
 +
 +      if (atomic_long_read(&bc->nr_keys))
 +              panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
 +                    atomic_long_read(&bc->nr_keys));
 +
 +      if (bc->table_init_done)
 +              rhashtable_destroy(&bc->table);
 +
 +      free_percpu(bc->pcpu_freed);
 +}
 +
 +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 +{
 +      mutex_init(&c->lock);
 +      INIT_LIST_HEAD(&c->freed_pcpu);
 +      INIT_LIST_HEAD(&c->freed_nonpcpu);
 +}
 +
 +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
++      struct shrinker *shrink;
 +
 +#ifdef __KERNEL__
 +      bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
 +      if (!bc->pcpu_freed)
 +              return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 +#endif
 +
 +      if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
 +              return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 +
 +      bc->table_init_done = true;
 +
++      shrink = shrinker_alloc(0, "%s/btree_key_cache", c->name);
++      if (!shrink)
 +              return -BCH_ERR_ENOMEM_fs_btree_cache_init;
++      bc->shrink = shrink;
++      shrink->seeks           = 0;
++      shrink->count_objects   = bch2_btree_key_cache_count;
++      shrink->scan_objects    = bch2_btree_key_cache_scan;
++      shrink->private_data    = c;
++      shrinker_register(shrink);
 +      return 0;
 +}
 +
 +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 +{
 +      prt_printf(out, "nr_freed:\t%lu",       atomic_long_read(&c->nr_freed));
 +      prt_newline(out);
 +      prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
 +      prt_newline(out);
 +      prt_printf(out, "nr_dirty:\t%lu",       atomic_long_read(&c->nr_dirty));
 +      prt_newline(out);
 +}
 +
 +void bch2_btree_key_cache_exit(void)
 +{
 +      kmem_cache_destroy(bch2_key_cache);
 +}
 +
 +int __init bch2_btree_key_cache_init(void)
 +{
 +      bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
 +      if (!bch2_key_cache)
 +              return -ENOMEM;
 +
 +      return 0;
 +}
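
The "++" lines in the file above resolve the merge by converting the btree key cache from an embedded "struct shrinker" set up with register_shrinker()/unregister_shrinker() to the dynamically allocated shrinker API: shrinker_alloc(), shrinker_register() and shrinker_free(), with the owning bch_fs recovered through shrink->private_data instead of container_of(). A minimal, self-contained sketch of that registration pattern follows; the my_cache structure and my_cache_* callbacks are hypothetical names used only to illustrate the API and are not bcachefs code.

/* Illustrative sketch of the dynamically allocated shrinker pattern. */
#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/shrinker.h>

struct my_cache {
	struct shrinker	*shrink;
	atomic_long_t	nr_objects;
};

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* Owning object comes from ->private_data, not container_of(): */
	struct my_cache *cache = shrink->private_data;

	return atomic_long_read(&cache->nr_objects);
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/*
	 * A real implementation would free up to sc->nr_to_scan objects
	 * and return how many were freed; this stub frees nothing.
	 */
	return 0;
}

static int my_cache_init(struct my_cache *cache, const char *name)
{
	struct shrinker *shrink = shrinker_alloc(0, "%s/my_cache", name);

	if (!shrink)
		return -ENOMEM;

	cache->shrink		= shrink;
	shrink->count_objects	= my_cache_count;
	shrink->scan_objects	= my_cache_scan;
	shrink->private_data	= cache;
	shrinker_register(shrink);
	return 0;
}

static void my_cache_exit(struct my_cache *cache)
{
	/* Unregisters (if registered) and frees the shrinker. */
	shrinker_free(cache->shrink);
}

The same conversion shows up in the next file's resolution, where the btree node cache's embedded "struct shrinker shrink" member becomes a "struct shrinker *shrink" pointer.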
index c9a38e254949ec2b4aa400d253a52e099c1b0292,0000000000000000000000000000000000000000..bc6714d88925f3183ed9d817f8a79f106e55da7c
mode 100644,000000..100644
--- /dev/null
@@@ -1,739 -1,0 +1,739 @@@
-       struct shrinker         shrink;
 +/* SPDX-License-Identifier: GPL-2.0 */
 +#ifndef _BCACHEFS_BTREE_TYPES_H
 +#define _BCACHEFS_BTREE_TYPES_H
 +
 +#include <linux/list.h>
 +#include <linux/rhashtable.h>
 +
 +//#include "bkey_methods.h"
 +#include "buckets_types.h"
 +#include "darray.h"
 +#include "errcode.h"
 +#include "journal_types.h"
 +#include "replicas_types.h"
 +#include "six.h"
 +
 +struct open_bucket;
 +struct btree_update;
 +struct btree_trans;
 +
 +#define MAX_BSETS             3U
 +
 +struct btree_nr_keys {
 +
 +      /*
 +       * Amount of live metadata (i.e. size of node after a compaction) in
 +       * units of u64s
 +       */
 +      u16                     live_u64s;
 +      u16                     bset_u64s[MAX_BSETS];
 +
 +      /* live keys only: */
 +      u16                     packed_keys;
 +      u16                     unpacked_keys;
 +};
 +
 +struct bset_tree {
 +      /*
 +       * We construct a binary tree in an array as if the array
 +       * started at 1, so that things line up on the same cachelines
 +       * better: see comments in bset.c at cacheline_to_bkey() for
 +       * details
 +       */
 +
 +      /* size of the binary tree and prev array */
 +      u16                     size;
 +
 +      /* function of size - precalculated for to_inorder() */
 +      u16                     extra;
 +
 +      u16                     data_offset;
 +      u16                     aux_data_offset;
 +      u16                     end_offset;
 +};
 +
 +struct btree_write {
 +      struct journal_entry_pin        journal;
 +};
 +
 +struct btree_alloc {
 +      struct open_buckets     ob;
 +      __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
 +};
 +
 +struct btree_bkey_cached_common {
 +      struct six_lock         lock;
 +      u8                      level;
 +      u8                      btree_id;
 +      bool                    cached;
 +};
 +
 +struct btree {
 +      struct btree_bkey_cached_common c;
 +
 +      struct rhash_head       hash;
 +      u64                     hash_val;
 +
 +      unsigned long           flags;
 +      u16                     written;
 +      u8                      nsets;
 +      u8                      nr_key_bits;
 +      u16                     version_ondisk;
 +
 +      struct bkey_format      format;
 +
 +      struct btree_node       *data;
 +      void                    *aux_data;
 +
 +      /*
 +       * Sets of sorted keys - the real btree node - plus a binary search tree
 +       *
 +       * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
 +       * to the memory we have allocated for this btree node. Additionally,
 +       * set[0]->data points to the entire btree node as it exists on disk.
 +       */
 +      struct bset_tree        set[MAX_BSETS];
 +
 +      struct btree_nr_keys    nr;
 +      u16                     sib_u64s[2];
 +      u16                     whiteout_u64s;
 +      u8                      byte_order;
 +      u8                      unpack_fn_len;
 +
 +      struct btree_write      writes[2];
 +
 +      /* Key/pointer for this btree node */
 +      __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 +
 +      /*
 +       * XXX: add a delete sequence number, so when bch2_btree_node_relock()
 +       * fails because the lock sequence number has changed - i.e. the
 +       * contents were modified - we can still relock the node if it's still
 +       * the one we want, without redoing the traversal
 +       */
 +
 +      /*
 +       * For asynchronous splits/interior node updates:
 +       * When we do a split, we allocate new child nodes and update the parent
 +       * node to point to them: we update the parent in memory immediately,
 +       * but then we must wait until the children have been written out before
 +       * the update to the parent can be written - this is a list of the
 +       * btree_updates that are blocking this node from being
 +       * written:
 +       */
 +      struct list_head        write_blocked;
 +
 +      /*
 +       * Also for asynchronous splits/interior node updates:
 +       * If a btree node isn't reachable yet, we don't want to kick off
 +       * another write - because that write also won't yet be reachable and
 +       * marking it as completed before it's reachable would be incorrect:
 +       */
 +      unsigned long           will_make_reachable;
 +
 +      struct open_buckets     ob;
 +
 +      /* lru list */
 +      struct list_head        list;
 +};
 +
 +struct btree_cache {
 +      struct rhashtable       table;
 +      bool                    table_init_done;
 +      /*
 +       * We never free a struct btree, except on shutdown - we just put it on
 +       * the btree_cache_freed list and reuse it later. This simplifies the
 +       * code, and it doesn't cost us much memory as the memory usage is
 +       * dominated by buffers that hold the actual btree node data and those
 +       * can be freed - and the number of struct btrees allocated is
 +       * effectively bounded.
 +       *
 +       * btree_cache_freeable effectively is a small cache - we use it because
 +       * high order page allocations can be rather expensive, and it's quite
 +       * common to delete and allocate btree nodes in quick succession. It
 +       * should never grow past ~2-3 nodes in practice.
 +       */
 +      struct mutex            lock;
 +      struct list_head        live;
 +      struct list_head        freeable;
 +      struct list_head        freed_pcpu;
 +      struct list_head        freed_nonpcpu;
 +
 +      /* Number of elements in live + freeable lists */
 +      unsigned                used;
 +      unsigned                reserve;
 +      atomic_t                dirty;
-       struct shrinker         shrink;
++      struct shrinker         *shrink;
 +
 +      /*
 +       * If we need to allocate memory for a new btree node and that
 +       * allocation fails, we can cannibalize another node in the btree cache
 +       * to satisfy the allocation - lock to guarantee only one thread does
 +       * this at a time:
 +       */
 +      struct task_struct      *alloc_lock;
 +      struct closure_waitlist alloc_wait;
 +};
 +
 +struct btree_node_iter {
 +      struct btree_node_iter_set {
 +              u16     k, end;
 +      } data[MAX_BSETS];
 +};
 +
 +/*
 + * Iterate over all possible positions, synthesizing deleted keys for holes:
 + */
 +static const __maybe_unused u16 BTREE_ITER_SLOTS              = 1 << 0;
 +static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS         = 1 << 1;
 +/*
 + * Indicates that intent locks should be taken on leaf nodes, because we expect
 + * to be doing updates:
 + */
 +static const __maybe_unused u16 BTREE_ITER_INTENT             = 1 << 2;
 +/*
 + * Causes the btree iterator code to prefetch additional btree nodes from disk:
 + */
 +static const __maybe_unused u16 BTREE_ITER_PREFETCH           = 1 << 3;
 +/*
 + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
 + * @pos or the first key strictly greater than @pos
 + */
 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS         = 1 << 4;
 +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS                = 1 << 5;
 +static const __maybe_unused u16 BTREE_ITER_CACHED             = 1 << 6;
 +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE     = 1 << 7;
 +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES               = 1 << 8;
 +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL               = 1 << 9;
 +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS    = 1 << 10;
 +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS      = 1 << 11;
 +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS   = 1 << 12;
 +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE         = 1 << 13;
 +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL      = 1 << 14;
 +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL     = 1 << 15;
 +#define __BTREE_ITER_FLAGS_END                                               16
 +
 +enum btree_path_uptodate {
 +      BTREE_ITER_UPTODATE             = 0,
 +      BTREE_ITER_NEED_RELOCK          = 1,
 +      BTREE_ITER_NEED_TRAVERSE        = 2,
 +};
 +
 +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
 +#define TRACK_PATH_ALLOCATED
 +#endif
 +
 +struct btree_path {
 +      u8                      idx;
 +      u8                      sorted_idx;
 +      u8                      ref;
 +      u8                      intent_ref;
 +
 +      /* btree_iter_copy starts here: */
 +      struct bpos             pos;
 +
 +      enum btree_id           btree_id:5;
 +      bool                    cached:1;
 +      bool                    preserve:1;
 +      enum btree_path_uptodate uptodate:2;
 +      /*
 +       * When true, failing to relock this path will cause the transaction to
 +       * restart:
 +       */
 +      bool                    should_be_locked:1;
 +      unsigned                level:3,
 +                              locks_want:3;
 +      u8                      nodes_locked;
 +
 +      struct btree_path_level {
 +              struct btree    *b;
 +              struct btree_node_iter iter;
 +              u32             lock_seq;
 +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
 +              u64             lock_taken_time;
 +#endif
 +      }                       l[BTREE_MAX_DEPTH];
 +#ifdef TRACK_PATH_ALLOCATED
 +      unsigned long           ip_allocated;
 +#endif
 +};
 +
 +static inline struct btree_path_level *path_l(struct btree_path *path)
 +{
 +      return path->l + path->level;
 +}
 +
 +static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
 +{
 +#ifdef TRACK_PATH_ALLOCATED
 +      return path->ip_allocated;
 +#else
 +      return _THIS_IP_;
 +#endif
 +}
 +
 +/*
 + * @pos                       - iterator's current position
 + * @level             - current btree depth
 + * @locks_want                - btree level below which we start taking intent locks
 + * @nodes_locked      - bitmask indicating which nodes in @nodes are locked
 + * @nodes_intent_locked       - bitmask indicating which locks are intent locks
 + */
 +struct btree_iter {
 +      struct btree_trans      *trans;
 +      struct btree_path       *path;
 +      struct btree_path       *update_path;
 +      struct btree_path       *key_cache_path;
 +
 +      enum btree_id           btree_id:8;
 +      unsigned                min_depth:3;
 +      unsigned                advanced:1;
 +
 +      /* btree_iter_copy starts here: */
 +      u16                     flags;
 +
 +      /* When we're filtering by snapshot, the snapshot ID we're looking for: */
 +      unsigned                snapshot;
 +
 +      struct bpos             pos;
 +      /*
 +       * Current unpacked key - so that bch2_btree_iter_next()/
 +       * bch2_btree_iter_next_slot() can correctly advance pos.
 +       */
 +      struct bkey             k;
 +
 +      /* BTREE_ITER_WITH_JOURNAL: */
 +      size_t                  journal_idx;
 +      struct bpos             journal_pos;
 +#ifdef TRACK_PATH_ALLOCATED
 +      unsigned long           ip_allocated;
 +#endif
 +};
 +
 +struct btree_key_cache_freelist {
 +      struct bkey_cached      *objs[16];
 +      unsigned                nr;
 +};
 +
 +struct btree_key_cache {
 +      struct mutex            lock;
 +      struct rhashtable       table;
 +      bool                    table_init_done;
 +      struct list_head        freed_pcpu;
 +      struct list_head        freed_nonpcpu;
++      struct shrinker         *shrink;
 +      unsigned                shrink_iter;
 +      struct btree_key_cache_freelist __percpu *pcpu_freed;
 +
 +      atomic_long_t           nr_freed;
 +      atomic_long_t           nr_keys;
 +      atomic_long_t           nr_dirty;
 +};
 +
 +struct bkey_cached_key {
 +      u32                     btree_id;
 +      struct bpos             pos;
 +} __packed __aligned(4);
 +
 +#define BKEY_CACHED_ACCESSED          0
 +#define BKEY_CACHED_DIRTY             1
 +
 +struct bkey_cached {
 +      struct btree_bkey_cached_common c;
 +
 +      unsigned long           flags;
 +      u16                     u64s;
 +      bool                    valid;
 +      u32                     btree_trans_barrier_seq;
 +      struct bkey_cached_key  key;
 +
 +      struct rhash_head       hash;
 +      struct list_head        list;
 +
 +      struct journal_preres   res;
 +      struct journal_entry_pin journal;
 +      u64                     seq;
 +
 +      struct bkey_i           *k;
 +};
 +
 +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
 +{
 +      return !b->cached
 +              ? container_of(b, struct btree, c)->key.k.p
 +              : container_of(b, struct bkey_cached, c)->key.pos;
 +}
 +
 +struct btree_insert_entry {
 +      unsigned                flags;
 +      u8                      bkey_type;
 +      enum btree_id           btree_id:8;
 +      u8                      level:4;
 +      bool                    cached:1;
 +      bool                    insert_trigger_run:1;
 +      bool                    overwrite_trigger_run:1;
 +      bool                    key_cache_already_flushed:1;
 +      /*
 +       * @old_k may be a key from the journal; @old_btree_u64s always refers
 +       * to the size of the key being overwritten in the btree:
 +       */
 +      u8                      old_btree_u64s;
 +      struct bkey_i           *k;
 +      struct btree_path       *path;
 +      u64                     seq;
 +      /* key being overwritten: */
 +      struct bkey             old_k;
 +      const struct bch_val    *old_v;
 +      unsigned long           ip_allocated;
 +};
 +
 +#ifndef CONFIG_LOCKDEP
 +#define BTREE_ITER_MAX                64
 +#else
 +#define BTREE_ITER_MAX                32
 +#endif
 +
 +struct btree_trans_commit_hook;
 +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
 +
 +struct btree_trans_commit_hook {
 +      btree_trans_commit_hook_fn      *fn;
 +      struct btree_trans_commit_hook  *next;
 +};
 +
 +#define BTREE_TRANS_MEM_MAX   (1U << 16)
 +
 +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS     10000
 +
 +struct btree_trans {
 +      struct bch_fs           *c;
 +      const char              *fn;
 +      struct closure          ref;
 +      struct list_head        list;
 +      u64                     last_begin_time;
 +
 +      u8                      lock_may_not_fail;
 +      u8                      lock_must_abort;
 +      struct btree_bkey_cached_common *locking;
 +      struct six_lock_waiter  locking_wait;
 +
 +      int                     srcu_idx;
 +
 +      u8                      fn_idx;
 +      u8                      nr_sorted;
 +      u8                      nr_updates;
 +      u8                      nr_wb_updates;
 +      u8                      wb_updates_size;
 +      bool                    used_mempool:1;
 +      bool                    in_traverse_all:1;
 +      bool                    paths_sorted:1;
 +      bool                    memory_allocation_failure:1;
 +      bool                    journal_transaction_names:1;
 +      bool                    journal_replay_not_finished:1;
 +      bool                    notrace_relock_fail:1;
 +      enum bch_errcode        restarted:16;
 +      u32                     restart_count;
 +      unsigned long           last_begin_ip;
 +      unsigned long           last_restarted_ip;
 +      unsigned long           srcu_lock_time;
 +
 +      /*
 +       * For when bch2_trans_update notices we'll be splitting a compressed
 +       * extent:
 +       */
 +      unsigned                extra_journal_res;
 +      unsigned                nr_max_paths;
 +
 +      u64                     paths_allocated;
 +
 +      unsigned                mem_top;
 +      unsigned                mem_max;
 +      unsigned                mem_bytes;
 +      void                    *mem;
 +
 +      u8                      sorted[BTREE_ITER_MAX + 8];
 +      struct btree_path       paths[BTREE_ITER_MAX];
 +      struct btree_insert_entry updates[BTREE_ITER_MAX];
 +      struct btree_write_buffered_key *wb_updates;
 +
 +      /* update path: */
 +      struct btree_trans_commit_hook *hooks;
 +      darray_u64              extra_journal_entries;
 +      struct journal_entry_pin *journal_pin;
 +
 +      struct journal_res      journal_res;
 +      struct journal_preres   journal_preres;
 +      u64                     *journal_seq;
 +      struct disk_reservation *disk_res;
 +      unsigned                journal_u64s;
 +      unsigned                journal_preres_u64s;
 +      struct replicas_delta_list *fs_usage_deltas;
 +};
 +
 +#define BCH_BTREE_WRITE_TYPES()                                               \
 +      x(initial,              0)                                      \
 +      x(init_next_bset,       1)                                      \
 +      x(cache_reclaim,        2)                                      \
 +      x(journal_reclaim,      3)                                      \
 +      x(interior,             4)
 +
 +enum btree_write_type {
 +#define x(t, n) BTREE_WRITE_##t,
 +      BCH_BTREE_WRITE_TYPES()
 +#undef x
 +      BTREE_WRITE_TYPE_NR,
 +};
 +
 +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
 +#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
 +
 +#define BTREE_FLAGS()                                                 \
 +      x(read_in_flight)                                               \
 +      x(read_error)                                                   \
 +      x(dirty)                                                        \
 +      x(need_write)                                                   \
 +      x(write_blocked)                                                \
 +      x(will_make_reachable)                                          \
 +      x(noevict)                                                      \
 +      x(write_idx)                                                    \
 +      x(accessed)                                                     \
 +      x(write_in_flight)                                              \
 +      x(write_in_flight_inner)                                        \
 +      x(just_written)                                                 \
 +      x(dying)                                                        \
 +      x(fake)                                                         \
 +      x(need_rewrite)                                                 \
 +      x(never_write)
 +
 +enum btree_flags {
 +      /* First bits for btree node write type */
 +      BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
 +#define x(flag)       BTREE_NODE_##flag,
 +      BTREE_FLAGS()
 +#undef x
 +};
 +
 +#define x(flag)                                                               \
 +static inline bool btree_node_ ## flag(struct btree *b)                       \
 +{     return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
 +                                                                      \
 +static inline void set_btree_node_ ## flag(struct btree *b)           \
 +{     set_bit(BTREE_NODE_ ## flag, &b->flags); }                      \
 +                                                                      \
 +static inline void clear_btree_node_ ## flag(struct btree *b)         \
 +{     clear_bit(BTREE_NODE_ ## flag, &b->flags); }
 +
 +BTREE_FLAGS()
 +#undef x
 +
 +static inline struct btree_write *btree_current_write(struct btree *b)
 +{
 +      return b->writes + btree_node_write_idx(b);
 +}
 +
 +static inline struct btree_write *btree_prev_write(struct btree *b)
 +{
 +      return b->writes + (btree_node_write_idx(b) ^ 1);
 +}
 +
 +static inline struct bset_tree *bset_tree_last(struct btree *b)
 +{
 +      EBUG_ON(!b->nsets);
 +      return b->set + b->nsets - 1;
 +}
 +
 +static inline void *
 +__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
 +{
 +      return (void *) ((u64 *) b->data + 1 + offset);
 +}
 +
 +static inline u16
 +__btree_node_ptr_to_offset(const struct btree *b, const void *p)
 +{
 +      u16 ret = (u64 *) p - 1 - (u64 *) b->data;
 +
 +      EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
 +      return ret;
 +}
 +
 +static inline struct bset *bset(const struct btree *b,
 +                              const struct bset_tree *t)
 +{
 +      return __btree_node_offset_to_ptr(b, t->data_offset);
 +}
 +
 +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
 +{
 +      t->end_offset =
 +              __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
 +}
 +
 +static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
 +                                const struct bset *i)
 +{
 +      t->data_offset = __btree_node_ptr_to_offset(b, i);
 +      set_btree_bset_end(b, t);
 +}
 +
 +static inline struct bset *btree_bset_first(struct btree *b)
 +{
 +      return bset(b, b->set);
 +}
 +
 +static inline struct bset *btree_bset_last(struct btree *b)
 +{
 +      return bset(b, bset_tree_last(b));
 +}
 +
 +static inline u16
 +__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
 +{
 +      return __btree_node_ptr_to_offset(b, k);
 +}
 +
 +static inline struct bkey_packed *
 +__btree_node_offset_to_key(const struct btree *b, u16 k)
 +{
 +      return __btree_node_offset_to_ptr(b, k);
 +}
 +
 +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
 +{
 +      return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
 +}
 +
 +#define btree_bkey_first(_b, _t)                                      \
 +({                                                                    \
 +      EBUG_ON(bset(_b, _t)->start !=                                  \
 +              __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
 +                                                                      \
 +      bset(_b, _t)->start;                                            \
 +})
 +
 +#define btree_bkey_last(_b, _t)                                               \
 +({                                                                    \
 +      EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=     \
 +              vstruct_last(bset(_b, _t)));                            \
 +                                                                      \
 +      __btree_node_offset_to_key(_b, (_t)->end_offset);               \
 +})
 +
 +static inline unsigned bset_u64s(struct bset_tree *t)
 +{
 +      return t->end_offset - t->data_offset -
 +              sizeof(struct bset) / sizeof(u64);
 +}
 +
 +static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
 +{
 +      return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
 +}
 +
 +static inline unsigned bset_byte_offset(struct btree *b, void *i)
 +{
 +      return i - (void *) b->data;
 +}
 +
 +enum btree_node_type {
 +#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
 +      BCH_BTREE_IDS()
 +#undef x
 +      BKEY_TYPE_btree,
 +};
 +
 +/* Type of a key in btree @id at level @level: */
 +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
 +{
 +      return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
 +}
 +
 +/* Type of keys @b contains: */
 +static inline enum btree_node_type btree_node_type(struct btree *b)
 +{
 +      return __btree_node_type(b->c.level, b->c.btree_id);
 +}
 +
 +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS            \
 +      (BIT(BKEY_TYPE_extents)|                        \
 +       BIT(BKEY_TYPE_alloc)|                          \
 +       BIT(BKEY_TYPE_inodes)|                         \
 +       BIT(BKEY_TYPE_stripes)|                        \
 +       BIT(BKEY_TYPE_reflink)|                        \
 +       BIT(BKEY_TYPE_btree))
 +
 +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS              \
 +      (BIT(BKEY_TYPE_alloc)|                          \
 +       BIT(BKEY_TYPE_inodes)|                         \
 +       BIT(BKEY_TYPE_stripes)|                        \
 +       BIT(BKEY_TYPE_snapshots))
 +
 +#define BTREE_NODE_TYPE_HAS_TRIGGERS                  \
 +      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
 +       BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 +
 +static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 +{
 +      return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
 +}
 +
 +static inline bool btree_node_type_is_extents(enum btree_node_type type)
 +{
 +      const unsigned mask = 0
 +#define x(name, nr, flags, ...)       |((!!((flags) & BTREE_ID_EXTENTS)) << nr)
 +      BCH_BTREE_IDS()
 +#undef x
 +      ;
 +
 +      return (1U << type) & mask;
 +}
 +
 +static inline bool btree_id_is_extents(enum btree_id btree)
 +{
 +      return btree_node_type_is_extents((enum btree_node_type) btree);
 +}
 +
 +static inline bool btree_type_has_snapshots(enum btree_id id)
 +{
 +      const unsigned mask = 0
 +#define x(name, nr, flags, ...)       |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
 +      BCH_BTREE_IDS()
 +#undef x
 +      ;
 +
 +      return (1U << id) & mask;
 +}
 +
 +static inline bool btree_type_has_ptrs(enum btree_id id)
 +{
 +      const unsigned mask = 0
 +#define x(name, nr, flags, ...)       |((!!((flags) & BTREE_ID_DATA)) << nr)
 +      BCH_BTREE_IDS()
 +#undef x
 +      ;
 +
 +      return (1U << id) & mask;
 +}
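 +
 +/*
 + * Each of the three helpers above builds its mask at compile time by OR-ing
 + * in bit 'nr' for every entry of BCH_BTREE_IDS() whose flags word carries
 + * the relevant BTREE_ID_* bit; the runtime check is then a single bit test
 + * against that constant.
 + */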
 +
 +struct btree_root {
 +      struct btree            *b;
 +
 +      /* On disk root - see async splits: */
 +      __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 +      u8                      level;
 +      u8                      alive;
 +      s8                      error;
 +};
 +
 +enum btree_gc_coalesce_fail_reason {
 +      BTREE_GC_COALESCE_FAIL_RESERVE_GET,
 +      BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
 +      BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
 +};
 +
 +enum btree_node_sibling {
 +      btree_prev_sib,
 +      btree_next_sib,
 +};
 +
 +#endif /* _BCACHEFS_BTREE_TYPES_H */
index 6642b88c41a0e27379d189372d9a369d282c338d,0000000000000000000000000000000000000000..a2a5133fb6b5aec8a3023dacf24c717d416c745f
mode 100644,000000..100644
--- /dev/null
@@@ -1,1980 -1,0 +1,1980 @@@
-       sb->s_shrink.seeks = 0;
 +// SPDX-License-Identifier: GPL-2.0
 +#ifndef NO_BCACHEFS_FS
 +
 +#include "bcachefs.h"
 +#include "acl.h"
 +#include "bkey_buf.h"
 +#include "btree_update.h"
 +#include "buckets.h"
 +#include "chardev.h"
 +#include "dirent.h"
 +#include "errcode.h"
 +#include "extents.h"
 +#include "fs.h"
 +#include "fs-common.h"
 +#include "fs-io.h"
 +#include "fs-ioctl.h"
 +#include "fs-io-buffered.h"
 +#include "fs-io-direct.h"
 +#include "fs-io-pagecache.h"
 +#include "fsck.h"
 +#include "inode.h"
 +#include "io_read.h"
 +#include "journal.h"
 +#include "keylist.h"
 +#include "quota.h"
 +#include "snapshot.h"
 +#include "super.h"
 +#include "xattr.h"
 +
 +#include <linux/aio.h>
 +#include <linux/backing-dev.h>
 +#include <linux/exportfs.h>
 +#include <linux/fiemap.h>
 +#include <linux/module.h>
 +#include <linux/pagemap.h>
 +#include <linux/posix_acl.h>
 +#include <linux/random.h>
 +#include <linux/seq_file.h>
 +#include <linux/statfs.h>
 +#include <linux/string.h>
 +#include <linux/xattr.h>
 +
 +static struct kmem_cache *bch2_inode_cache;
 +
 +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
 +                              struct bch_inode_info *,
 +                              struct bch_inode_unpacked *,
 +                              struct bch_subvolume *);
 +
 +void bch2_inode_update_after_write(struct btree_trans *trans,
 +                                 struct bch_inode_info *inode,
 +                                 struct bch_inode_unpacked *bi,
 +                                 unsigned fields)
 +{
 +      struct bch_fs *c = trans->c;
 +
 +      BUG_ON(bi->bi_inum != inode->v.i_ino);
 +
 +      bch2_assert_pos_locked(trans, BTREE_ID_inodes,
 +                             POS(0, bi->bi_inum),
 +                             c->opts.inodes_use_key_cache);
 +
 +      set_nlink(&inode->v, bch2_inode_nlink_get(bi));
 +      i_uid_write(&inode->v, bi->bi_uid);
 +      i_gid_write(&inode->v, bi->bi_gid);
 +      inode->v.i_mode = bi->bi_mode;
 +
 +      if (fields & ATTR_ATIME)
 +              inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
 +      if (fields & ATTR_MTIME)
 +              inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
 +      if (fields & ATTR_CTIME)
 +              inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
 +
 +      inode->ei_inode         = *bi;
 +
 +      bch2_inode_flags_to_vfs(inode);
 +}
 +
 +int __must_check bch2_write_inode(struct bch_fs *c,
 +                                struct bch_inode_info *inode,
 +                                inode_set_fn set,
 +                                void *p, unsigned fields)
 +{
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      struct btree_iter iter = { NULL };
 +      struct bch_inode_unpacked inode_u;
 +      int ret;
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
 +                              BTREE_ITER_INTENT) ?:
 +              (set ? set(trans, inode, &inode_u, p) : 0) ?:
 +              bch2_inode_write(trans, &iter, &inode_u) ?:
 +              bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 +
 +      /*
 +       * the btree node lock protects inode->ei_inode, not ei_update_lock;
 +       * this is important for inode updates via bchfs_write_index_update
 +       */
 +      if (!ret)
 +              bch2_inode_update_after_write(trans, inode, &inode_u, fields);
 +
 +      bch2_trans_iter_exit(trans, &iter);
 +
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +
 +      bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
 +                           "inode %u:%llu not found when updating",
 +                           inode_inum(inode).subvol,
 +                           inode_inum(inode).inum);
 +
 +      bch2_trans_put(trans);
 +      return ret < 0 ? ret : 0;
 +}
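 +
 +/*
 + * Callers pass an inode_set_fn that edits the unpacked inode inside the
 + * btree transaction; bch2_write_inode() owns the peek/commit/restart loop.
 + * See inode_update_times_fn() and bch2_vfs_write_inode() later in this file
 + * for a minimal example.
 + */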
 +
 +int bch2_fs_quota_transfer(struct bch_fs *c,
 +                         struct bch_inode_info *inode,
 +                         struct bch_qid new_qid,
 +                         unsigned qtypes,
 +                         enum quota_acct_mode mode)
 +{
 +      unsigned i;
 +      int ret;
 +
 +      qtypes &= enabled_qtypes(c);
 +
 +      for (i = 0; i < QTYP_NR; i++)
 +              if (new_qid.q[i] == inode->ei_qid.q[i])
 +                      qtypes &= ~(1U << i);
 +
 +      if (!qtypes)
 +              return 0;
 +
 +      mutex_lock(&inode->ei_quota_lock);
 +
 +      ret = bch2_quota_transfer(c, qtypes, new_qid,
 +                                inode->ei_qid,
 +                                inode->v.i_blocks +
 +                                inode->ei_quota_reserved,
 +                                mode);
 +      if (!ret)
 +              for (i = 0; i < QTYP_NR; i++)
 +                      if (qtypes & (1 << i))
 +                              inode->ei_qid.q[i] = new_qid.q[i];
 +
 +      mutex_unlock(&inode->ei_quota_lock);
 +
 +      return ret;
 +}
 +
 +static int bch2_iget5_test(struct inode *vinode, void *p)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +      subvol_inum *inum = p;
 +
 +      return inode->ei_subvol == inum->subvol &&
 +              inode->ei_inode.bi_inum == inum->inum;
 +}
 +
 +static int bch2_iget5_set(struct inode *vinode, void *p)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +      subvol_inum *inum = p;
 +
 +      inode->v.i_ino          = inum->inum;
 +      inode->ei_subvol        = inum->subvol;
 +      inode->ei_inode.bi_inum = inum->inum;
 +      return 0;
 +}
 +
 +static unsigned bch2_inode_hash(subvol_inum inum)
 +{
 +      return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
 +}
 +
 +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 +{
 +      struct bch_inode_unpacked inode_u;
 +      struct bch_inode_info *inode;
 +      struct btree_trans *trans;
 +      struct bch_subvolume subvol;
 +      int ret;
 +
 +      inode = to_bch_ei(iget5_locked(c->vfs_sb,
 +                                     bch2_inode_hash(inum),
 +                                     bch2_iget5_test,
 +                                     bch2_iget5_set,
 +                                     &inum));
 +      if (unlikely(!inode))
 +              return ERR_PTR(-ENOMEM);
 +      if (!(inode->v.i_state & I_NEW))
 +              return &inode->v;
 +
 +      trans = bch2_trans_get(c);
 +      ret = lockrestart_do(trans,
 +              bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
 +              bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
 +
 +      if (!ret)
 +              bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 +      bch2_trans_put(trans);
 +
 +      if (ret) {
 +              iget_failed(&inode->v);
 +              return ERR_PTR(bch2_err_class(ret));
 +      }
 +
 +      mutex_lock(&c->vfs_inodes_lock);
 +      list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
 +      mutex_unlock(&c->vfs_inodes_lock);
 +
 +      unlock_new_inode(&inode->v);
 +
 +      return &inode->v;
 +}
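 +
 +/*
 + * Note the standard iget5 pattern above: if the inode was already in the
 + * inode cache (no I_NEW), it is returned as-is; otherwise it is populated
 + * from the inodes btree under a transaction and then published with
 + * unlock_new_inode().
 + */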
 +
 +struct bch_inode_info *
 +__bch2_create(struct mnt_idmap *idmap,
 +            struct bch_inode_info *dir, struct dentry *dentry,
 +            umode_t mode, dev_t rdev, subvol_inum snapshot_src,
 +            unsigned flags)
 +{
 +      struct bch_fs *c = dir->v.i_sb->s_fs_info;
 +      struct btree_trans *trans;
 +      struct bch_inode_unpacked dir_u;
 +      struct bch_inode_info *inode, *old;
 +      struct bch_inode_unpacked inode_u;
 +      struct posix_acl *default_acl = NULL, *acl = NULL;
 +      subvol_inum inum;
 +      struct bch_subvolume subvol;
 +      u64 journal_seq = 0;
 +      int ret;
 +
 +      /*
 +       * preallocate acls + vfs inode before btree transaction, so that
 +       * nothing can fail after the transaction succeeds:
 +       */
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
 +      if (ret)
 +              return ERR_PTR(ret);
 +#endif
 +      inode = to_bch_ei(new_inode(c->vfs_sb));
 +      if (unlikely(!inode)) {
 +              inode = ERR_PTR(-ENOMEM);
 +              goto err;
 +      }
 +
 +      bch2_inode_init_early(c, &inode_u);
 +
 +      if (!(flags & BCH_CREATE_TMPFILE))
 +              mutex_lock(&dir->ei_update_lock);
 +
 +      trans = bch2_trans_get(c);
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret   = bch2_create_trans(trans,
 +                                inode_inum(dir), &dir_u, &inode_u,
 +                                !(flags & BCH_CREATE_TMPFILE)
 +                                ? &dentry->d_name : NULL,
 +                                from_kuid(i_user_ns(&dir->v), current_fsuid()),
 +                                from_kgid(i_user_ns(&dir->v), current_fsgid()),
 +                                mode, rdev,
 +                                default_acl, acl, snapshot_src, flags) ?:
 +              bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
 +                              KEY_TYPE_QUOTA_PREALLOC);
 +      if (unlikely(ret))
 +              goto err_before_quota;
 +
 +      inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
 +      inum.inum = inode_u.bi_inum;
 +
 +      ret   = bch2_subvolume_get(trans, inum.subvol, true,
 +                                 BTREE_ITER_WITH_UPDATES, &subvol) ?:
 +              bch2_trans_commit(trans, NULL, &journal_seq, 0);
 +      if (unlikely(ret)) {
 +              bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
 +                              KEY_TYPE_QUOTA_WARN);
 +err_before_quota:
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      goto retry;
 +              goto err_trans;
 +      }
 +
 +      if (!(flags & BCH_CREATE_TMPFILE)) {
 +              bch2_inode_update_after_write(trans, dir, &dir_u,
 +                                            ATTR_MTIME|ATTR_CTIME);
 +              mutex_unlock(&dir->ei_update_lock);
 +      }
 +
 +      bch2_iget5_set(&inode->v, &inum);
 +      bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 +
 +      set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 +      set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
 +
 +      /*
 +       * we must insert the new inode into the inode cache before calling
 +       * bch2_trans_put() and dropping locks, else we could race with another
 +       * thread pulling the inode in and modifying it:
 +       */
 +
 +      inode->v.i_state |= I_CREATING;
 +
 +      old = to_bch_ei(inode_insert5(&inode->v,
 +                                    bch2_inode_hash(inum),
 +                                    bch2_iget5_test,
 +                                    bch2_iget5_set,
 +                                    &inum));
 +      BUG_ON(!old);
 +
 +      if (unlikely(old != inode)) {
 +              /*
 +               * We raced, another process pulled the new inode into cache
 +               * before us:
 +               */
 +              make_bad_inode(&inode->v);
 +              iput(&inode->v);
 +
 +              inode = old;
 +      } else {
 +              mutex_lock(&c->vfs_inodes_lock);
 +              list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
 +              mutex_unlock(&c->vfs_inodes_lock);
 +              /*
 +               * we really don't want insert_inode_locked2() to be setting
 +               * I_NEW...
 +               */
 +              unlock_new_inode(&inode->v);
 +      }
 +
 +      bch2_trans_put(trans);
 +err:
 +      posix_acl_release(default_acl);
 +      posix_acl_release(acl);
 +      return inode;
 +err_trans:
 +      if (!(flags & BCH_CREATE_TMPFILE))
 +              mutex_unlock(&dir->ei_update_lock);
 +
 +      bch2_trans_put(trans);
 +      make_bad_inode(&inode->v);
 +      iput(&inode->v);
 +      inode = ERR_PTR(ret);
 +      goto err;
 +}
 +
 +/* methods */
 +
 +static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 +                                unsigned int flags)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir);
 +      struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
 +      struct inode *vinode = NULL;
 +      subvol_inum inum = { .subvol = 1 };
 +      int ret;
 +
 +      ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
 +                               &dentry->d_name, &inum);
 +
 +      if (!ret)
 +              vinode = bch2_vfs_inode_get(c, inum);
 +
 +      return d_splice_alias(vinode, dentry);
 +}
 +
 +static int bch2_mknod(struct mnt_idmap *idmap,
 +                    struct inode *vdir, struct dentry *dentry,
 +                    umode_t mode, dev_t rdev)
 +{
 +      struct bch_inode_info *inode =
 +              __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
 +                            (subvol_inum) { 0 }, 0);
 +
 +      if (IS_ERR(inode))
 +              return bch2_err_class(PTR_ERR(inode));
 +
 +      d_instantiate(dentry, &inode->v);
 +      return 0;
 +}
 +
 +static int bch2_create(struct mnt_idmap *idmap,
 +                     struct inode *vdir, struct dentry *dentry,
 +                     umode_t mode, bool excl)
 +{
 +      return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
 +}
 +
 +static int __bch2_link(struct bch_fs *c,
 +                     struct bch_inode_info *inode,
 +                     struct bch_inode_info *dir,
 +                     struct dentry *dentry)
 +{
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      struct bch_inode_unpacked dir_u, inode_u;
 +      int ret;
 +
 +      mutex_lock(&inode->ei_update_lock);
 +
 +      ret = commit_do(trans, NULL, NULL, 0,
 +                      bch2_link_trans(trans,
 +                                      inode_inum(dir),   &dir_u,
 +                                      inode_inum(inode), &inode_u,
 +                                      &dentry->d_name));
 +
 +      if (likely(!ret)) {
 +              bch2_inode_update_after_write(trans, dir, &dir_u,
 +                                            ATTR_MTIME|ATTR_CTIME);
 +              bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
 +      }
 +
 +      bch2_trans_put(trans);
 +      mutex_unlock(&inode->ei_update_lock);
 +      return ret;
 +}
 +
 +static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 +                   struct dentry *dentry)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir);
 +      struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
 +      int ret;
 +
 +      lockdep_assert_held(&inode->v.i_rwsem);
 +
 +      ret = __bch2_link(c, inode, dir, dentry);
 +      if (unlikely(ret))
 +              return ret;
 +
 +      ihold(&inode->v);
 +      d_instantiate(dentry, &inode->v);
 +      return 0;
 +}
 +
 +int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 +                bool deleting_snapshot)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir);
 +      struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 +      struct bch_inode_unpacked dir_u, inode_u;
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      int ret;
 +
 +      bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
 +
 +      ret = commit_do(trans, NULL, NULL,
 +                      BTREE_INSERT_NOFAIL,
 +              bch2_unlink_trans(trans,
 +                                inode_inum(dir), &dir_u,
 +                                &inode_u, &dentry->d_name,
 +                                deleting_snapshot));
 +      if (unlikely(ret))
 +              goto err;
 +
 +      bch2_inode_update_after_write(trans, dir, &dir_u,
 +                                    ATTR_MTIME|ATTR_CTIME);
 +      bch2_inode_update_after_write(trans, inode, &inode_u,
 +                                    ATTR_MTIME);
 +
 +      if (inode_u.bi_subvol) {
 +              /*
 +               * Subvolume deletion is asynchronous, but we still want to tell
 +               * the VFS that it's been deleted here:
 +               */
 +              set_nlink(&inode->v, 0);
 +      }
 +err:
 +      bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
 +      bch2_trans_put(trans);
 +
 +      return ret;
 +}
 +
 +static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 +{
 +      return __bch2_unlink(vdir, dentry, false);
 +}
 +
 +static int bch2_symlink(struct mnt_idmap *idmap,
 +                      struct inode *vdir, struct dentry *dentry,
 +                      const char *symname)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
 +      int ret;
 +
 +      inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
 +                            (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 +      if (IS_ERR(inode))
 +              return bch2_err_class(PTR_ERR(inode));
 +
 +      inode_lock(&inode->v);
 +      ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
 +      inode_unlock(&inode->v);
 +
 +      if (unlikely(ret))
 +              goto err;
 +
 +      ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
 +      if (unlikely(ret))
 +              goto err;
 +
 +      ret = __bch2_link(c, inode, dir, dentry);
 +      if (unlikely(ret))
 +              goto err;
 +
 +      d_instantiate(dentry, &inode->v);
 +      return 0;
 +err:
 +      iput(&inode->v);
 +      return ret;
 +}
 +
 +static int bch2_mkdir(struct mnt_idmap *idmap,
 +                    struct inode *vdir, struct dentry *dentry, umode_t mode)
 +{
 +      return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
 +}
 +
 +static int bch2_rename2(struct mnt_idmap *idmap,
 +                      struct inode *src_vdir, struct dentry *src_dentry,
 +                      struct inode *dst_vdir, struct dentry *dst_dentry,
 +                      unsigned flags)
 +{
 +      struct bch_fs *c = src_vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
 +      struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
 +      struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
 +      struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
 +      struct bch_inode_unpacked dst_dir_u, src_dir_u;
 +      struct bch_inode_unpacked src_inode_u, dst_inode_u;
 +      struct btree_trans *trans;
 +      enum bch_rename_mode mode = flags & RENAME_EXCHANGE
 +              ? BCH_RENAME_EXCHANGE
 +              : dst_dentry->d_inode
 +              ? BCH_RENAME_OVERWRITE : BCH_RENAME;
 +      int ret;
 +
 +      if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
 +              return -EINVAL;
 +
 +      if (mode == BCH_RENAME_OVERWRITE) {
 +              ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
 +                                                 0, LLONG_MAX);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      trans = bch2_trans_get(c);
 +
 +      bch2_lock_inodes(INODE_UPDATE_LOCK,
 +                       src_dir,
 +                       dst_dir,
 +                       src_inode,
 +                       dst_inode);
 +
 +      if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
 +              ret = bch2_fs_quota_transfer(c, src_inode,
 +                                           dst_dir->ei_qid,
 +                                           1 << QTYP_PRJ,
 +                                           KEY_TYPE_QUOTA_PREALLOC);
 +              if (ret)
 +                      goto err;
 +      }
 +
 +      if (mode == BCH_RENAME_EXCHANGE &&
 +          inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
 +              ret = bch2_fs_quota_transfer(c, dst_inode,
 +                                           src_dir->ei_qid,
 +                                           1 << QTYP_PRJ,
 +                                           KEY_TYPE_QUOTA_PREALLOC);
 +              if (ret)
 +                      goto err;
 +      }
 +
 +      ret = commit_do(trans, NULL, NULL, 0,
 +                      bch2_rename_trans(trans,
 +                                        inode_inum(src_dir), &src_dir_u,
 +                                        inode_inum(dst_dir), &dst_dir_u,
 +                                        &src_inode_u,
 +                                        &dst_inode_u,
 +                                        &src_dentry->d_name,
 +                                        &dst_dentry->d_name,
 +                                        mode));
 +      if (unlikely(ret))
 +              goto err;
 +
 +      BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
 +      BUG_ON(dst_inode &&
 +             dst_inode->v.i_ino != dst_inode_u.bi_inum);
 +
 +      bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
 +                                    ATTR_MTIME|ATTR_CTIME);
 +
 +      if (src_dir != dst_dir)
 +              bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
 +                                            ATTR_MTIME|ATTR_CTIME);
 +
 +      bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
 +                                    ATTR_CTIME);
 +
 +      if (dst_inode)
 +              bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
 +                                            ATTR_CTIME);
 +err:
 +      bch2_trans_put(trans);
 +
 +      bch2_fs_quota_transfer(c, src_inode,
 +                             bch_qid(&src_inode->ei_inode),
 +                             1 << QTYP_PRJ,
 +                             KEY_TYPE_QUOTA_NOCHECK);
 +      if (dst_inode)
 +              bch2_fs_quota_transfer(c, dst_inode,
 +                                     bch_qid(&dst_inode->ei_inode),
 +                                     1 << QTYP_PRJ,
 +                                     KEY_TYPE_QUOTA_NOCHECK);
 +
 +      bch2_unlock_inodes(INODE_UPDATE_LOCK,
 +                         src_dir,
 +                         dst_dir,
 +                         src_inode,
 +                         dst_inode);
 +
 +      return ret;
 +}
 +
 +static void bch2_setattr_copy(struct mnt_idmap *idmap,
 +                            struct bch_inode_info *inode,
 +                            struct bch_inode_unpacked *bi,
 +                            struct iattr *attr)
 +{
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      unsigned int ia_valid = attr->ia_valid;
 +
 +      if (ia_valid & ATTR_UID)
 +              bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
 +      if (ia_valid & ATTR_GID)
 +              bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
 +
 +      if (ia_valid & ATTR_SIZE)
 +              bi->bi_size = attr->ia_size;
 +
 +      if (ia_valid & ATTR_ATIME)
 +              bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
 +      if (ia_valid & ATTR_MTIME)
 +              bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
 +      if (ia_valid & ATTR_CTIME)
 +              bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
 +
 +      if (ia_valid & ATTR_MODE) {
 +              umode_t mode = attr->ia_mode;
 +              kgid_t gid = ia_valid & ATTR_GID
 +                      ? attr->ia_gid
 +                      : inode->v.i_gid;
 +
 +              if (!in_group_p(gid) &&
 +                  !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
 +                      mode &= ~S_ISGID;
 +              bi->bi_mode = mode;
 +      }
 +}
 +
 +int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 +                       struct bch_inode_info *inode,
 +                       struct iattr *attr)
 +{
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      struct bch_qid qid;
 +      struct btree_trans *trans;
 +      struct btree_iter inode_iter = { NULL };
 +      struct bch_inode_unpacked inode_u;
 +      struct posix_acl *acl = NULL;
 +      int ret;
 +
 +      mutex_lock(&inode->ei_update_lock);
 +
 +      qid = inode->ei_qid;
 +
 +      if (attr->ia_valid & ATTR_UID)
 +              qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
 +
 +      if (attr->ia_valid & ATTR_GID)
 +              qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
 +
 +      ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
 +                                   KEY_TYPE_QUOTA_PREALLOC);
 +      if (ret)
 +              goto err;
 +
 +      trans = bch2_trans_get(c);
 +retry:
 +      bch2_trans_begin(trans);
 +      kfree(acl);
 +      acl = NULL;
 +
 +      ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
 +                            BTREE_ITER_INTENT);
 +      if (ret)
 +              goto btree_err;
 +
 +      bch2_setattr_copy(idmap, inode, &inode_u, attr);
 +
 +      if (attr->ia_valid & ATTR_MODE) {
 +              ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
 +                                   inode_u.bi_mode, &acl);
 +              if (ret)
 +                      goto btree_err;
 +      }
 +
 +      ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
 +              bch2_trans_commit(trans, NULL, NULL,
 +                                BTREE_INSERT_NOFAIL);
 +btree_err:
 +      bch2_trans_iter_exit(trans, &inode_iter);
 +
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +      if (unlikely(ret))
 +              goto err_trans;
 +
 +      bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
 +
 +      if (acl)
 +              set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 +err_trans:
 +      bch2_trans_put(trans);
 +err:
 +      mutex_unlock(&inode->ei_update_lock);
 +
 +      return bch2_err_class(ret);
 +}
 +
 +static int bch2_getattr(struct mnt_idmap *idmap,
 +                      const struct path *path, struct kstat *stat,
 +                      u32 request_mask, unsigned query_flags)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +
 +      stat->dev       = inode->v.i_sb->s_dev;
 +      stat->ino       = inode->v.i_ino;
 +      stat->mode      = inode->v.i_mode;
 +      stat->nlink     = inode->v.i_nlink;
 +      stat->uid       = inode->v.i_uid;
 +      stat->gid       = inode->v.i_gid;
 +      stat->rdev      = inode->v.i_rdev;
 +      stat->size      = i_size_read(&inode->v);
 +      stat->atime     = inode_get_atime(&inode->v);
 +      stat->mtime     = inode_get_mtime(&inode->v);
 +      stat->ctime     = inode_get_ctime(&inode->v);
 +      stat->blksize   = block_bytes(c);
 +      stat->blocks    = inode->v.i_blocks;
 +
 +      if (request_mask & STATX_BTIME) {
 +              stat->result_mask |= STATX_BTIME;
 +              stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
 +      }
 +
 +      if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
 +              stat->attributes |= STATX_ATTR_IMMUTABLE;
 +      stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
 +
 +      if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
 +              stat->attributes |= STATX_ATTR_APPEND;
 +      stat->attributes_mask    |= STATX_ATTR_APPEND;
 +
 +      if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
 +              stat->attributes |= STATX_ATTR_NODUMP;
 +      stat->attributes_mask    |= STATX_ATTR_NODUMP;
 +
 +      return 0;
 +}
 +
 +static int bch2_setattr(struct mnt_idmap *idmap,
 +                      struct dentry *dentry, struct iattr *iattr)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 +      int ret;
 +
 +      lockdep_assert_held(&inode->v.i_rwsem);
 +
 +      ret = setattr_prepare(idmap, dentry, iattr);
 +      if (ret)
 +              return ret;
 +
 +      return iattr->ia_valid & ATTR_SIZE
 +              ? bchfs_truncate(idmap, inode, iattr)
 +              : bch2_setattr_nonsize(idmap, inode, iattr);
 +}
 +
 +static int bch2_tmpfile(struct mnt_idmap *idmap,
 +                      struct inode *vdir, struct file *file, umode_t mode)
 +{
 +      struct bch_inode_info *inode =
 +              __bch2_create(idmap, to_bch_ei(vdir),
 +                            file->f_path.dentry, mode, 0,
 +                            (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 +
 +      if (IS_ERR(inode))
 +              return bch2_err_class(PTR_ERR(inode));
 +
 +      d_mark_tmpfile(file, &inode->v);
 +      d_instantiate(file->f_path.dentry, &inode->v);
 +      return finish_open_simple(file, 0);
 +}
 +
 +static int bch2_fill_extent(struct bch_fs *c,
 +                          struct fiemap_extent_info *info,
 +                          struct bkey_s_c k, unsigned flags)
 +{
 +      if (bkey_extent_is_direct_data(k.k)) {
 +              struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 +              const union bch_extent_entry *entry;
 +              struct extent_ptr_decoded p;
 +              int ret;
 +
 +              if (k.k->type == KEY_TYPE_reflink_v)
 +                      flags |= FIEMAP_EXTENT_SHARED;
 +
 +              bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 +                      int flags2 = 0;
 +                      u64 offset = p.ptr.offset;
 +
 +                      if (p.ptr.unwritten)
 +                              flags2 |= FIEMAP_EXTENT_UNWRITTEN;
 +
 +                      if (p.crc.compression_type)
 +                              flags2 |= FIEMAP_EXTENT_ENCODED;
 +                      else
 +                              offset += p.crc.offset;
 +
 +                      if ((offset & (block_sectors(c) - 1)) ||
 +                          (k.k->size & (block_sectors(c) - 1)))
 +                              flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
 +
 +                      ret = fiemap_fill_next_extent(info,
 +                                              bkey_start_offset(k.k) << 9,
 +                                              offset << 9,
 +                                              k.k->size << 9, flags|flags2);
 +                      if (ret)
 +                              return ret;
 +              }
 +
 +              return 0;
 +      } else if (bkey_extent_is_inline_data(k.k)) {
 +              return fiemap_fill_next_extent(info,
 +                                             bkey_start_offset(k.k) << 9,
 +                                             0, k.k->size << 9,
 +                                             flags|
 +                                             FIEMAP_EXTENT_DATA_INLINE);
 +      } else if (k.k->type == KEY_TYPE_reservation) {
 +              return fiemap_fill_next_extent(info,
 +                                             bkey_start_offset(k.k) << 9,
 +                                             0, k.k->size << 9,
 +                                             flags|
 +                                             FIEMAP_EXTENT_DELALLOC|
 +                                             FIEMAP_EXTENT_UNWRITTEN);
 +      } else {
 +              BUG();
 +      }
 +}
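 +
 +/*
 + * Extent keys are in 512-byte sectors; the "<< 9" conversions above produce
 + * the byte offsets and lengths that fiemap_fill_next_extent() expects.
 + */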
 +
 +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 +                     u64 start, u64 len)
 +{
 +      struct bch_fs *c = vinode->i_sb->s_fs_info;
 +      struct bch_inode_info *ei = to_bch_ei(vinode);
 +      struct btree_trans *trans;
 +      struct btree_iter iter;
 +      struct bkey_s_c k;
 +      struct bkey_buf cur, prev;
 +      struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
 +      unsigned offset_into_extent, sectors;
 +      bool have_extent = false;
 +      u32 snapshot;
 +      int ret = 0;
 +
 +      ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
 +      if (ret)
 +              return ret;
 +
 +      if (start + len < start)
 +              return -EINVAL;
 +
 +      start >>= 9;
 +
 +      bch2_bkey_buf_init(&cur);
 +      bch2_bkey_buf_init(&prev);
 +      trans = bch2_trans_get(c);
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
 +      if (ret)
 +              goto err;
 +
 +      bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 +                           SPOS(ei->v.i_ino, start, snapshot), 0);
 +
 +      while (!(ret = btree_trans_too_many_iters(trans)) &&
 +             (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
 +             !(ret = bkey_err(k))) {
 +              enum btree_id data_btree = BTREE_ID_extents;
 +
 +              if (!bkey_extent_is_data(k.k) &&
 +                  k.k->type != KEY_TYPE_reservation) {
 +                      bch2_btree_iter_advance(&iter);
 +                      continue;
 +              }
 +
 +              offset_into_extent      = iter.pos.offset -
 +                      bkey_start_offset(k.k);
 +              sectors                 = k.k->size - offset_into_extent;
 +
 +              bch2_bkey_buf_reassemble(&cur, c, k);
 +
 +              ret = bch2_read_indirect_extent(trans, &data_btree,
 +                                      &offset_into_extent, &cur);
 +              if (ret)
 +                      break;
 +
 +              k = bkey_i_to_s_c(cur.k);
 +              bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
 +
 +              sectors = min(sectors, k.k->size - offset_into_extent);
 +
 +              bch2_cut_front(POS(k.k->p.inode,
 +                                 bkey_start_offset(k.k) +
 +                                 offset_into_extent),
 +                             cur.k);
 +              bch2_key_resize(&cur.k->k, sectors);
 +              cur.k->k.p = iter.pos;
 +              cur.k->k.p.offset += cur.k->k.size;
 +
 +              if (have_extent) {
 +                      bch2_trans_unlock(trans);
 +                      ret = bch2_fill_extent(c, info,
 +                                      bkey_i_to_s_c(prev.k), 0);
 +                      if (ret)
 +                              break;
 +              }
 +
 +              bkey_copy(prev.k, cur.k);
 +              have_extent = true;
 +
 +              bch2_btree_iter_set_pos(&iter,
 +                      POS(iter.pos.inode, iter.pos.offset + sectors));
 +      }
 +      start = iter.pos.offset;
 +      bch2_trans_iter_exit(trans, &iter);
 +err:
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +
 +      if (!ret && have_extent) {
 +              bch2_trans_unlock(trans);
 +              ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
 +                                     FIEMAP_EXTENT_LAST);
 +      }
 +
 +      bch2_trans_put(trans);
 +      bch2_bkey_buf_exit(&cur, c);
 +      bch2_bkey_buf_exit(&prev, c);
 +      return ret < 0 ? ret : 0;
 +}
 +
 +static const struct vm_operations_struct bch_vm_ops = {
 +      .fault          = bch2_page_fault,
 +      .map_pages      = filemap_map_pages,
 +      .page_mkwrite   = bch2_page_mkwrite,
 +};
 +
 +static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
 +{
 +      file_accessed(file);
 +
 +      vma->vm_ops = &bch_vm_ops;
 +      return 0;
 +}
 +
 +/* Directories: */
 +
 +static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
 +{
 +      return generic_file_llseek_size(file, offset, whence,
 +                                      S64_MAX, S64_MAX);
 +}
 +
 +static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 +{
 +      struct bch_inode_info *inode = file_bch_inode(file);
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      int ret;
 +
 +      if (!dir_emit_dots(file, ctx))
 +              return 0;
 +
 +      ret = bch2_readdir(c, inode_inum(inode), ctx);
 +      if (ret)
 +              bch_err_fn(c, ret);
 +
 +      return bch2_err_class(ret);
 +}
 +
 +static const struct file_operations bch_file_operations = {
 +      .llseek         = bch2_llseek,
 +      .read_iter      = bch2_read_iter,
 +      .write_iter     = bch2_write_iter,
 +      .mmap           = bch2_mmap,
 +      .open           = generic_file_open,
 +      .fsync          = bch2_fsync,
 +      .splice_read    = filemap_splice_read,
 +      .splice_write   = iter_file_splice_write,
 +      .fallocate      = bch2_fallocate_dispatch,
 +      .unlocked_ioctl = bch2_fs_file_ioctl,
 +#ifdef CONFIG_COMPAT
 +      .compat_ioctl   = bch2_compat_fs_ioctl,
 +#endif
 +      .remap_file_range = bch2_remap_file_range,
 +};
 +
 +static const struct inode_operations bch_file_inode_operations = {
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .fiemap         = bch2_fiemap,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct inode_operations bch_dir_inode_operations = {
 +      .lookup         = bch2_lookup,
 +      .create         = bch2_create,
 +      .link           = bch2_link,
 +      .unlink         = bch2_unlink,
 +      .symlink        = bch2_symlink,
 +      .mkdir          = bch2_mkdir,
 +      .rmdir          = bch2_unlink,
 +      .mknod          = bch2_mknod,
 +      .rename         = bch2_rename2,
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .tmpfile        = bch2_tmpfile,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct file_operations bch_dir_file_operations = {
 +      .llseek         = bch2_dir_llseek,
 +      .read           = generic_read_dir,
 +      .iterate_shared = bch2_vfs_readdir,
 +      .fsync          = bch2_fsync,
 +      .unlocked_ioctl = bch2_fs_file_ioctl,
 +#ifdef CONFIG_COMPAT
 +      .compat_ioctl   = bch2_compat_fs_ioctl,
 +#endif
 +};
 +
 +static const struct inode_operations bch_symlink_inode_operations = {
 +      .get_link       = page_get_link,
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct inode_operations bch_special_inode_operations = {
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct address_space_operations bch_address_space_operations = {
 +      .read_folio     = bch2_read_folio,
 +      .writepages     = bch2_writepages,
 +      .readahead      = bch2_readahead,
 +      .dirty_folio    = filemap_dirty_folio,
 +      .write_begin    = bch2_write_begin,
 +      .write_end      = bch2_write_end,
 +      .invalidate_folio = bch2_invalidate_folio,
 +      .release_folio  = bch2_release_folio,
 +      .direct_IO      = noop_direct_IO,
 +#ifdef CONFIG_MIGRATION
 +      .migrate_folio  = filemap_migrate_folio,
 +#endif
 +      .error_remove_page = generic_error_remove_page,
 +};
 +
 +struct bcachefs_fid {
 +      u64             inum;
 +      u32             subvol;
 +      u32             gen;
 +} __packed;
 +
 +struct bcachefs_fid_with_parent {
 +      struct bcachefs_fid     fid;
 +      struct bcachefs_fid     dir;
 +} __packed;
 +
 +static int bcachefs_fid_valid(int fh_len, int fh_type)
 +{
 +      switch (fh_type) {
 +      case FILEID_BCACHEFS_WITHOUT_PARENT:
 +              return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
 +      case FILEID_BCACHEFS_WITH_PARENT:
 +              return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
 +      default:
 +              return false;
 +      }
 +}
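 +
 +/*
 + * fh_len is counted in 32-bit words, as usual for exportfs file handles:
 + * struct bcachefs_fid packs a u64 plus two u32s into 16 bytes (4 words),
 + * and bcachefs_fid_with_parent is two of those (8 words).
 + */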
 +
 +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
 +{
 +      return (struct bcachefs_fid) {
 +              .inum   = inode->ei_inode.bi_inum,
 +              .subvol = inode->ei_subvol,
 +              .gen    = inode->ei_inode.bi_generation,
 +      };
 +}
 +
 +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
 +                        struct inode *vdir)
 +{
 +      struct bch_inode_info *inode    = to_bch_ei(vinode);
 +      struct bch_inode_info *dir      = to_bch_ei(vdir);
 +
 +      if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
 +              return FILEID_INVALID;
 +
 +      if (!S_ISDIR(inode->v.i_mode) && dir) {
 +              struct bcachefs_fid_with_parent *fid = (void *) fh;
 +
 +              fid->fid = bch2_inode_to_fid(inode);
 +              fid->dir = bch2_inode_to_fid(dir);
 +
 +              *len = sizeof(*fid) / sizeof(u32);
 +              return FILEID_BCACHEFS_WITH_PARENT;
 +      } else {
 +              struct bcachefs_fid *fid = (void *) fh;
 +
 +              *fid = bch2_inode_to_fid(inode);
 +
 +              *len = sizeof(*fid) / sizeof(u32);
 +              return FILEID_BCACHEFS_WITHOUT_PARENT;
 +      }
 +}
 +
 +static struct inode *bch2_nfs_get_inode(struct super_block *sb,
 +                                      struct bcachefs_fid fid)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
 +                                  .subvol = fid.subvol,
 +                                  .inum = fid.inum,
 +      });
 +      if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
 +              iput(vinode);
 +              vinode = ERR_PTR(-ESTALE);
 +      }
 +      return vinode;
 +}
 +
 +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
 +              int fh_len, int fh_type)
 +{
 +      struct bcachefs_fid *fid = (void *) _fid;
 +
 +      if (!bcachefs_fid_valid(fh_len, fh_type))
 +              return NULL;
 +
 +      return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
 +}
 +
 +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
 +              int fh_len, int fh_type)
 +{
 +      struct bcachefs_fid_with_parent *fid = (void *) _fid;
 +
 +      if (!bcachefs_fid_valid(fh_len, fh_type) ||
 +          fh_type != FILEID_BCACHEFS_WITH_PARENT)
 +              return NULL;
 +
 +      return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
 +}
 +
 +static struct dentry *bch2_get_parent(struct dentry *child)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(child->d_inode);
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      subvol_inum parent_inum = {
 +              .subvol = inode->ei_inode.bi_parent_subvol ?:
 +                      inode->ei_subvol,
 +              .inum = inode->ei_inode.bi_dir,
 +      };
 +
 +      if (!parent_inum.inum)
 +              return NULL;
 +
 +      return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
 +}
 +
 +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 +{
 +      struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
 +      struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      struct btree_trans *trans;
 +      struct btree_iter iter1;
 +      struct btree_iter iter2;
 +      struct bkey_s_c k;
 +      struct bkey_s_c_dirent d;
 +      struct bch_inode_unpacked inode_u;
 +      subvol_inum target;
 +      u32 snapshot;
 +      struct qstr dirent_name;
 +      unsigned name_len = 0;
 +      int ret;
 +
 +      if (!S_ISDIR(dir->v.i_mode))
 +              return -EINVAL;
 +
 +      trans = bch2_trans_get(c);
 +
 +      bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
 +                           POS(dir->ei_inode.bi_inum, 0), 0);
 +      bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
 +                           POS(dir->ei_inode.bi_inum, 0), 0);
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
 +      if (ret)
 +              goto err;
 +
 +      bch2_btree_iter_set_snapshot(&iter1, snapshot);
 +      bch2_btree_iter_set_snapshot(&iter2, snapshot);
 +
 +      ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
 +      if (ret)
 +              goto err;
 +
 +      if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
 +              bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
 +
 +              k = bch2_btree_iter_peek_slot(&iter1);
 +              ret = bkey_err(k);
 +              if (ret)
 +                      goto err;
 +
 +              if (k.k->type != KEY_TYPE_dirent) {
 +                      ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
 +                      goto err;
 +              }
 +
 +              d = bkey_s_c_to_dirent(k);
 +              ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 +              if (ret > 0)
 +                      ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
 +              if (ret)
 +                      goto err;
 +
 +              if (target.subvol       == inode->ei_subvol &&
 +                  target.inum         == inode->ei_inode.bi_inum)
 +                      goto found;
 +      } else {
 +              /*
 +               * File with multiple hardlinks and our backref is to the wrong
 +               * directory - linear search:
 +               */
 +              for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
 +                      if (k.k->p.inode > dir->ei_inode.bi_inum)
 +                              break;
 +
 +                      if (k.k->type != KEY_TYPE_dirent)
 +                              continue;
 +
 +                      d = bkey_s_c_to_dirent(k);
 +                      ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 +                      if (ret < 0)
 +                              break;
 +                      if (ret)
 +                              continue;
 +
 +                      if (target.subvol       == inode->ei_subvol &&
 +                          target.inum         == inode->ei_inode.bi_inum)
 +                              goto found;
 +              }
 +      }
 +
 +      ret = -ENOENT;
 +      goto err;
 +found:
 +      dirent_name = bch2_dirent_get_name(d);
 +
 +      name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
 +      memcpy(name, dirent_name.name, name_len);
 +      name[name_len] = '\0';
 +err:
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +
 +      bch2_trans_iter_exit(trans, &iter1);
 +      bch2_trans_iter_exit(trans, &iter2);
 +      bch2_trans_put(trans);
 +
 +      return ret;
 +}
 +
 +static const struct export_operations bch_export_ops = {
 +      .encode_fh      = bch2_encode_fh,
 +      .fh_to_dentry   = bch2_fh_to_dentry,
 +      .fh_to_parent   = bch2_fh_to_parent,
 +      .get_parent     = bch2_get_parent,
 +      .get_name       = bch2_get_name,
 +};
 +
 +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
 +                              struct bch_inode_info *inode,
 +                              struct bch_inode_unpacked *bi,
 +                              struct bch_subvolume *subvol)
 +{
 +      bch2_inode_update_after_write(trans, inode, bi, ~0);
 +
 +      if (BCH_SUBVOLUME_SNAP(subvol))
 +              set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
 +      else
 +              clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
 +
 +      inode->v.i_blocks       = bi->bi_sectors;
 +      inode->v.i_ino          = bi->bi_inum;
 +      inode->v.i_rdev         = bi->bi_dev;
 +      inode->v.i_generation   = bi->bi_generation;
 +      inode->v.i_size         = bi->bi_size;
 +
 +      inode->ei_flags         = 0;
 +      inode->ei_quota_reserved = 0;
 +      inode->ei_qid           = bch_qid(bi);
 +      inode->ei_subvol        = inum.subvol;
 +
 +      inode->v.i_mapping->a_ops = &bch_address_space_operations;
 +
 +      switch (inode->v.i_mode & S_IFMT) {
 +      case S_IFREG:
 +              inode->v.i_op   = &bch_file_inode_operations;
 +              inode->v.i_fop  = &bch_file_operations;
 +              break;
 +      case S_IFDIR:
 +              inode->v.i_op   = &bch_dir_inode_operations;
 +              inode->v.i_fop  = &bch_dir_file_operations;
 +              break;
 +      case S_IFLNK:
 +              inode_nohighmem(&inode->v);
 +              inode->v.i_op   = &bch_symlink_inode_operations;
 +              break;
 +      default:
 +              init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
 +              inode->v.i_op   = &bch_special_inode_operations;
 +              break;
 +      }
 +
 +      mapping_set_large_folios(inode->v.i_mapping);
 +}
 +
 +static struct inode *bch2_alloc_inode(struct super_block *sb)
 +{
 +      struct bch_inode_info *inode;
 +
 +      inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
 +      if (!inode)
 +              return NULL;
 +
 +      inode_init_once(&inode->v);
 +      mutex_init(&inode->ei_update_lock);
 +      two_state_lock_init(&inode->ei_pagecache_lock);
 +      INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
 +      mutex_init(&inode->ei_quota_lock);
 +
 +      return &inode->v;
 +}
 +
 +static void bch2_i_callback(struct rcu_head *head)
 +{
 +      struct inode *vinode = container_of(head, struct inode, i_rcu);
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +
 +      kmem_cache_free(bch2_inode_cache, inode);
 +}
 +
 +static void bch2_destroy_inode(struct inode *vinode)
 +{
 +      call_rcu(&vinode->i_rcu, bch2_i_callback);
 +}
 +
 +static int inode_update_times_fn(struct btree_trans *trans,
 +                               struct bch_inode_info *inode,
 +                               struct bch_inode_unpacked *bi,
 +                               void *p)
 +{
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +
 +      bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
 +      bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
 +      bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
 +
 +      return 0;
 +}
 +
 +static int bch2_vfs_write_inode(struct inode *vinode,
 +                              struct writeback_control *wbc)
 +{
 +      struct bch_fs *c = vinode->i_sb->s_fs_info;
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +      int ret;
 +
 +      mutex_lock(&inode->ei_update_lock);
 +      ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
 +                             ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
 +      mutex_unlock(&inode->ei_update_lock);
 +
 +      return bch2_err_class(ret);
 +}
 +
 +static void bch2_evict_inode(struct inode *vinode)
 +{
 +      struct bch_fs *c = vinode->i_sb->s_fs_info;
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +
 +      truncate_inode_pages_final(&inode->v.i_data);
 +
 +      clear_inode(&inode->v);
 +
 +      BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
 +
 +      if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
 +              bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
 +                              KEY_TYPE_QUOTA_WARN);
 +              bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
 +                              KEY_TYPE_QUOTA_WARN);
 +              bch2_inode_rm(c, inode_inum(inode));
 +      }
 +
 +      mutex_lock(&c->vfs_inodes_lock);
 +      list_del_init(&inode->ei_vfs_inode_list);
 +      mutex_unlock(&c->vfs_inodes_lock);
 +}
 +
 +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
 +{
 +      struct bch_inode_info *inode, **i;
 +      DARRAY(struct bch_inode_info *) grabbed;
 +      bool clean_pass = false, this_pass_clean;
 +
 +      /*
 +       * Initially, we scan for inodes without I_DONTCACHE, then mark them to
 +       * be pruned with d_mark_dontcache().
 +       *
 +       * Once we've had a clean pass where we didn't find any inodes without
 +       * I_DONTCACHE, we wait for them to be freed:
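 +       *
 +       * Concretely: this_pass_clean tracks whether the current scan found
 +       * anything to grab, clean_pass latches the previous scan's result, and
 +       * the function only returns once two consecutive passes found nothing
 +       * left to prune.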
 +       */
 +
 +      darray_init(&grabbed);
 +      darray_make_room(&grabbed, 1024);
 +again:
 +      cond_resched();
 +      this_pass_clean = true;
 +
 +      mutex_lock(&c->vfs_inodes_lock);
 +      list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
 +              if (!snapshot_list_has_id(s, inode->ei_subvol))
 +                      continue;
 +
 +              if (!(inode->v.i_state & I_DONTCACHE) &&
 +                  !(inode->v.i_state & I_FREEING) &&
 +                  igrab(&inode->v)) {
 +                      this_pass_clean = false;
 +
 +                      if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
 +                              iput(&inode->v);
 +                              break;
 +                      }
 +              } else if (clean_pass && this_pass_clean) {
 +                      wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
 +                      DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
 +
 +                      prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 +                      mutex_unlock(&c->vfs_inodes_lock);
 +
 +                      schedule();
 +                      finish_wait(wq, &wait.wq_entry);
 +                      goto again;
 +              }
 +      }
 +      mutex_unlock(&c->vfs_inodes_lock);
 +
 +      darray_for_each(grabbed, i) {
 +              inode = *i;
 +              d_mark_dontcache(&inode->v);
 +              d_prune_aliases(&inode->v);
 +              iput(&inode->v);
 +      }
 +      grabbed.nr = 0;
 +
 +      if (!clean_pass || !this_pass_clean) {
 +              clean_pass = this_pass_clean;
 +              goto again;
 +      }
 +
 +      darray_exit(&grabbed);
 +}
 +
 +static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 +{
 +      struct super_block *sb = dentry->d_sb;
 +      struct bch_fs *c = sb->s_fs_info;
 +      struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
 +      unsigned shift = sb->s_blocksize_bits - 9;
 +      /*
 +       * this assumes inodes take up 64 bytes, which is a decent average
 +       * number:
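 +       *
 +       * (usage is reported in 512-byte sectors, so each free sector is
 +       * assumed to hold 512 / 64 = 8 inodes; hence the << 3.)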
 +       */
 +      u64 avail_inodes = ((usage.capacity - usage.used) << 3);
 +      u64 fsid;
 +
 +      buf->f_type     = BCACHEFS_STATFS_MAGIC;
 +      buf->f_bsize    = sb->s_blocksize;
 +      buf->f_blocks   = usage.capacity >> shift;
 +      buf->f_bfree    = usage.free >> shift;
 +      buf->f_bavail   = avail_factor(usage.free) >> shift;
 +
 +      buf->f_files    = usage.nr_inodes + avail_inodes;
 +      buf->f_ffree    = avail_inodes;
 +
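 +      /*
 +       * Fold the 128-bit user UUID down to 64 bits, then split it into the
 +       * two 32-bit f_fsid words:
 +       */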
 +      fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
 +             le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
 +      buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 +      buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 +      buf->f_namelen  = BCH_NAME_MAX;
 +
 +      return 0;
 +}
 +
 +static int bch2_sync_fs(struct super_block *sb, int wait)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      int ret;
 +
 +      if (c->opts.journal_flush_disabled)
 +              return 0;
 +
 +      if (!wait) {
 +              bch2_journal_flush_async(&c->journal, NULL);
 +              return 0;
 +      }
 +
 +      ret = bch2_journal_flush(&c->journal);
 +      return bch2_err_class(ret);
 +}
 +
 +static struct bch_fs *bch2_path_to_fs(const char *path)
 +{
 +      struct bch_fs *c;
 +      dev_t dev;
 +      int ret;
 +
 +      ret = lookup_bdev(path, &dev);
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      c = bch2_dev_to_fs(dev);
 +      if (c)
 +              closure_put(&c->cl);
 +      return c ?: ERR_PTR(-ENOENT);
 +}
 +
 +static char **split_devs(const char *_dev_name, unsigned *nr)
 +{
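 +      /*
 +       * Split a colon separated device list: e.g. "/dev/sda1:/dev/sdb1"
 +       * (illustrative paths) becomes { "/dev/sda1", "/dev/sdb1", NULL }.
 +       * The strings point into a single kstrdup()'d buffer, which is why
 +       * the caller frees devs[0] and then the array itself.
 +       */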
 +      char *dev_name = NULL, **devs = NULL, *s;
 +      size_t i = 0, nr_devs = 0;
 +
 +      dev_name = kstrdup(_dev_name, GFP_KERNEL);
 +      if (!dev_name)
 +              return NULL;
 +
 +      for (s = dev_name; s; s = strchr(s + 1, ':'))
 +              nr_devs++;
 +
 +      devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
 +      if (!devs) {
 +              kfree(dev_name);
 +              return NULL;
 +      }
 +
 +      while ((s = strsep(&dev_name, ":")))
 +              devs[i++] = s;
 +
 +      *nr = nr_devs;
 +      return devs;
 +}
 +
 +static int bch2_remount(struct super_block *sb, int *flags, char *data)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      struct bch_opts opts = bch2_opts_empty();
 +      int ret;
 +
 +      opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
 +
 +      ret = bch2_parse_mount_opts(c, &opts, data);
 +      if (ret)
 +              goto err;
 +
 +      if (opts.read_only != c->opts.read_only) {
 +              down_write(&c->state_lock);
 +
 +              if (opts.read_only) {
 +                      bch2_fs_read_only(c);
 +
 +                      sb->s_flags |= SB_RDONLY;
 +              } else {
 +                      ret = bch2_fs_read_write(c);
 +                      if (ret) {
 +                              bch_err(c, "error going rw: %i", ret);
 +                              up_write(&c->state_lock);
 +                              ret = -EINVAL;
 +                              goto err;
 +                      }
 +
 +                      sb->s_flags &= ~SB_RDONLY;
 +              }
 +
 +              c->opts.read_only = opts.read_only;
 +
 +              up_write(&c->state_lock);
 +      }
 +
 +      if (opt_defined(opts, errors))
 +              c->opts.errors = opts.errors;
 +err:
 +      return bch2_err_class(ret);
 +}
 +
 +static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 +{
 +      struct bch_fs *c = root->d_sb->s_fs_info;
 +      struct bch_dev *ca;
 +      unsigned i;
 +      bool first = true;
 +
 +      for_each_online_member(ca, c, i) {
 +              if (!first)
 +                      seq_putc(seq, ':');
 +              first = false;
 +              seq_puts(seq, "/dev/");
 +              seq_puts(seq, ca->name);
 +      }
 +
 +      return 0;
 +}
 +
 +static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 +{
 +      struct bch_fs *c = root->d_sb->s_fs_info;
 +      enum bch_opt_id i;
 +      struct printbuf buf = PRINTBUF;
 +      int ret = 0;
 +
 +      for (i = 0; i < bch2_opts_nr; i++) {
 +              const struct bch_option *opt = &bch2_opt_table[i];
 +              u64 v = bch2_opt_get_by_id(&c->opts, i);
 +
 +              if (!(opt->flags & OPT_MOUNT))
 +                      continue;
 +
 +              if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
 +                      continue;
 +
 +              printbuf_reset(&buf);
 +              bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
 +                               OPT_SHOW_MOUNT_STYLE);
 +              seq_putc(seq, ',');
 +              seq_puts(seq, buf.buf);
 +      }
 +
 +      if (buf.allocation_failure)
 +              ret = -ENOMEM;
 +      printbuf_exit(&buf);
 +      return ret;
 +}
 +
 +static void bch2_put_super(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +
 +      __bch2_fs_stop(c);
 +}
 +
 +/*
 + * bcachefs doesn't currently integrate intwrite freeze protection but the
 + * internal write references serve the same purpose. Therefore reuse the
 + * read-only transition code to perform the quiesce. The caveat is that we don't
 + * currently have the ability to block tasks that want a write reference while
 + * the superblock is frozen. This is fine for now, but we should either add
 + * blocking support or find a way to integrate sb_start_intwrite() and friends.
 + */
 +static int bch2_freeze(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +
 +      down_write(&c->state_lock);
 +      bch2_fs_read_only(c);
 +      up_write(&c->state_lock);
 +      return 0;
 +}
 +
 +static int bch2_unfreeze(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      int ret;
 +
 +      down_write(&c->state_lock);
 +      ret = bch2_fs_read_write(c);
 +      up_write(&c->state_lock);
 +      return ret;
 +}
 +
 +static const struct super_operations bch_super_operations = {
 +      .alloc_inode    = bch2_alloc_inode,
 +      .destroy_inode  = bch2_destroy_inode,
 +      .write_inode    = bch2_vfs_write_inode,
 +      .evict_inode    = bch2_evict_inode,
 +      .sync_fs        = bch2_sync_fs,
 +      .statfs         = bch2_statfs,
 +      .show_devname   = bch2_show_devname,
 +      .show_options   = bch2_show_options,
 +      .remount_fs     = bch2_remount,
 +      .put_super      = bch2_put_super,
 +      .freeze_fs      = bch2_freeze,
 +      .unfreeze_fs    = bch2_unfreeze,
 +};
 +
 +static int bch2_set_super(struct super_block *s, void *data)
 +{
 +      s->s_fs_info = data;
 +      return 0;
 +}
 +
 +static int bch2_noset_super(struct super_block *s, void *data)
 +{
 +      return -EBUSY;
 +}
 +
 +static int bch2_test_super(struct super_block *s, void *data)
 +{
 +      struct bch_fs *c = s->s_fs_info;
 +      struct bch_fs **devs = data;
 +      unsigned i;
 +
 +      if (!c)
 +              return false;
 +
 +      for (i = 0; devs[i]; i++)
 +              if (c != devs[i])
 +                      return false;
 +      return true;
 +}
 +
 +static struct dentry *bch2_mount(struct file_system_type *fs_type,
 +                               int flags, const char *dev_name, void *data)
 +{
 +      struct bch_fs *c;
 +      struct bch_dev *ca;
 +      struct super_block *sb;
 +      struct inode *vinode;
 +      struct bch_opts opts = bch2_opts_empty();
 +      char **devs;
 +      struct bch_fs **devs_to_fs = NULL;
 +      unsigned i, nr_devs;
 +      int ret;
 +
 +      opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
 +
 +      ret = bch2_parse_mount_opts(NULL, &opts, data);
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      if (!dev_name || strlen(dev_name) == 0)
 +              return ERR_PTR(-EINVAL);
 +
 +      devs = split_devs(dev_name, &nr_devs);
 +      if (!devs)
 +              return ERR_PTR(-ENOMEM);
 +
 +      devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
 +      if (!devs_to_fs) {
 +              sb = ERR_PTR(-ENOMEM);
 +              goto got_sb;
 +      }
 +
 +      for (i = 0; i < nr_devs; i++)
 +              devs_to_fs[i] = bch2_path_to_fs(devs[i]);
 +
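 +      /*
 +       * Look for an existing superblock for this set of devices: each path
 +       * is mapped to an already-open bch_fs (if any), and bch2_test_super()
 +       * only matches a superblock whose bch_fs is what every listed device
 +       * resolved to.
 +       */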
 +      sb = sget(fs_type, bch2_test_super, bch2_noset_super,
 +                flags|SB_NOSEC, devs_to_fs);
 +      if (!IS_ERR(sb))
 +              goto got_sb;
 +
 +      c = bch2_fs_open(devs, nr_devs, opts);
 +      if (IS_ERR(c)) {
 +              sb = ERR_CAST(c);
 +              goto got_sb;
 +      }
 +
 +      /* Some options can't be parsed until after the fs is started: */
 +      ret = bch2_parse_mount_opts(c, &opts, data);
 +      if (ret) {
 +              bch2_fs_stop(c);
 +              sb = ERR_PTR(ret);
 +              goto got_sb;
 +      }
 +
 +      bch2_opts_apply(&c->opts, opts);
 +
 +      sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
 +      if (IS_ERR(sb))
 +              bch2_fs_stop(c);
 +got_sb:
 +      kfree(devs_to_fs);
 +      kfree(devs[0]);
 +      kfree(devs);
 +
 +      if (IS_ERR(sb)) {
 +              ret = PTR_ERR(sb);
 +              ret = bch2_err_class(ret);
 +              return ERR_PTR(ret);
 +      }
 +
 +      c = sb->s_fs_info;
 +
 +      if (sb->s_root) {
 +              if ((flags ^ sb->s_flags) & SB_RDONLY) {
 +                      ret = -EBUSY;
 +                      goto err_put_super;
 +              }
 +              goto out;
 +      }
 +
 +      sb->s_blocksize         = block_bytes(c);
 +      sb->s_blocksize_bits    = ilog2(block_bytes(c));
 +      sb->s_maxbytes          = MAX_LFS_FILESIZE;
 +      sb->s_op                = &bch_super_operations;
 +      sb->s_export_op         = &bch_export_ops;
 +#ifdef CONFIG_BCACHEFS_QUOTA
 +      sb->s_qcop              = &bch2_quotactl_operations;
 +      sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
 +#endif
 +      sb->s_xattr             = bch2_xattr_handlers;
 +      sb->s_magic             = BCACHEFS_STATFS_MAGIC;
 +      sb->s_time_gran         = c->sb.nsec_per_time_unit;
 +      sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
 +      sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
 +      c->vfs_sb               = sb;
 +      strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 +
 +      ret = super_setup_bdi(sb);
 +      if (ret)
 +              goto err_put_super;
 +
 +      sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
 +
 +      for_each_online_member(ca, c, i) {
 +              struct block_device *bdev = ca->disk_sb.bdev;
 +
 +              /* XXX: create an anonymous device for multi device filesystems */
 +              sb->s_bdev      = bdev;
 +              sb->s_dev       = bdev->bd_dev;
 +              percpu_ref_put(&ca->io_ref);
 +              break;
 +      }
 +
 +      c->dev = sb->s_dev;
 +
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      if (c->opts.acl)
 +              sb->s_flags     |= SB_POSIXACL;
 +#endif
 +
++      sb->s_shrink->seeks = 0;
 +
 +      vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
 +      ret = PTR_ERR_OR_ZERO(vinode);
 +      if (ret) {
 +              bch_err_msg(c, ret, "mounting: error getting root inode");
 +              goto err_put_super;
 +      }
 +
 +      sb->s_root = d_make_root(vinode);
 +      if (!sb->s_root) {
 +              bch_err(c, "error mounting: error allocating root dentry");
 +              ret = -ENOMEM;
 +              goto err_put_super;
 +      }
 +
 +      sb->s_flags |= SB_ACTIVE;
 +out:
 +      return dget(sb->s_root);
 +
 +err_put_super:
 +      sb->s_fs_info = NULL;
 +      c->vfs_sb = NULL;
 +      deactivate_locked_super(sb);
 +      bch2_fs_stop(c);
 +      return ERR_PTR(bch2_err_class(ret));
 +}
 +
 +static void bch2_kill_sb(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +
 +      if (c)
 +              c->vfs_sb = NULL;
 +      generic_shutdown_super(sb);
 +      if (c)
 +              bch2_fs_free(c);
 +}
 +
 +static struct file_system_type bcache_fs_type = {
 +      .owner          = THIS_MODULE,
 +      .name           = "bcachefs",
 +      .mount          = bch2_mount,
 +      .kill_sb        = bch2_kill_sb,
 +      .fs_flags       = FS_REQUIRES_DEV,
 +};
 +
 +MODULE_ALIAS_FS("bcachefs");
 +
 +void bch2_vfs_exit(void)
 +{
 +      unregister_filesystem(&bcache_fs_type);
 +      kmem_cache_destroy(bch2_inode_cache);
 +}
 +
 +int __init bch2_vfs_init(void)
 +{
 +      int ret = -ENOMEM;
 +
 +      bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
 +      if (!bch2_inode_cache)
 +              goto err;
 +
 +      ret = register_filesystem(&bcache_fs_type);
 +      if (ret)
 +              goto err;
 +
 +      return 0;
 +err:
 +      bch2_vfs_exit();
 +      return ret;
 +}
 +
 +#endif /* NO_BCACHEFS_FS */
index eb764b9a4629696e9444103c272ad01ed07643c0,0000000000000000000000000000000000000000..397116966a7cd40ef629b98cf16670476ff583a6
mode 100644,000000..100644
--- /dev/null
@@@ -1,1031 -1,0 +1,1031 @@@
-               c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * bcache sysfs interfaces
 + *
 + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 + * Copyright 2012 Google, Inc.
 + */
 +
 +#ifndef NO_BCACHEFS_SYSFS
 +
 +#include "bcachefs.h"
 +#include "alloc_background.h"
 +#include "alloc_foreground.h"
 +#include "sysfs.h"
 +#include "btree_cache.h"
 +#include "btree_io.h"
 +#include "btree_iter.h"
 +#include "btree_key_cache.h"
 +#include "btree_update.h"
 +#include "btree_update_interior.h"
 +#include "btree_gc.h"
 +#include "buckets.h"
 +#include "clock.h"
 +#include "disk_groups.h"
 +#include "ec.h"
 +#include "inode.h"
 +#include "journal.h"
 +#include "keylist.h"
 +#include "move.h"
 +#include "movinggc.h"
 +#include "nocow_locking.h"
 +#include "opts.h"
 +#include "rebalance.h"
 +#include "replicas.h"
 +#include "super-io.h"
 +#include "tests.h"
 +
 +#include <linux/blkdev.h>
 +#include <linux/sort.h>
 +#include <linux/sched/clock.h>
 +
 +#include "util.h"
 +
 +#define SYSFS_OPS(type)                                                       \
 +const struct sysfs_ops type ## _sysfs_ops = {                         \
 +      .show   = type ## _show,                                        \
 +      .store  = type ## _store                                        \
 +}
 +
 +#define SHOW(fn)                                                      \
 +static ssize_t fn ## _to_text(struct printbuf *,                      \
 +                            struct kobject *, struct attribute *);    \
 +                                                                      \
 +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
 +                         char *buf)                                   \
 +{                                                                     \
 +      struct printbuf out = PRINTBUF;                                 \
 +      ssize_t ret = fn ## _to_text(&out, kobj, attr);                 \
 +                                                                      \
 +      if (out.pos && out.buf[out.pos - 1] != '\n')                    \
 +              prt_newline(&out);                                      \
 +                                                                      \
 +      if (!ret && out.allocation_failure)                             \
 +              ret = -ENOMEM;                                          \
 +                                                                      \
 +      if (!ret) {                                                     \
 +              ret = min_t(size_t, out.pos, PAGE_SIZE - 1);            \
 +              memcpy(buf, out.buf, ret);                              \
 +      }                                                               \
 +      printbuf_exit(&out);                                            \
 +      return bch2_err_class(ret);                                     \
 +}                                                                     \
 +                                                                      \
 +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
 +                            struct attribute *attr)
 +
 +#define STORE(fn)                                                     \
 +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
 +                          const char *, size_t);                      \
 +                                                                      \
 +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
 +                          const char *buf, size_t size)               \
 +{                                                                     \
 +      return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
 +}                                                                     \
 +                                                                      \
 +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
 +                                const char *buf, size_t size)
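 +
 +/*
 + * SHOW(foo) and STORE(foo) are each followed directly by a function body:
 + * the generated foo_show() wrapper allocates the printbuf and copies it to
 + * the sysfs buffer, both wrappers map errors with bch2_err_class(), and
 + * SYSFS_OPS(foo) packages the pair into a struct sysfs_ops.
 + */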
 +
 +#define __sysfs_attribute(_name, _mode)                                       \
 +      static struct attribute sysfs_##_name =                         \
 +              { .name = #_name, .mode = _mode }
 +
 +#define write_attribute(n)    __sysfs_attribute(n, 0200)
 +#define read_attribute(n)     __sysfs_attribute(n, 0444)
 +#define rw_attribute(n)               __sysfs_attribute(n, 0644)
 +
 +#define sysfs_printf(file, fmt, ...)                                  \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              prt_printf(out, fmt "\n", __VA_ARGS__);                 \
 +} while (0)
 +
 +#define sysfs_print(file, var)                                                \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              snprint(out, var);                                      \
 +} while (0)
 +
 +#define sysfs_hprint(file, val)                                               \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              prt_human_readable_s64(out, val);                       \
 +} while (0)
 +
 +#define sysfs_strtoul(file, var)                                      \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              return strtoul_safe(buf, var) ?: (ssize_t) size;        \
 +} while (0)
 +
 +#define sysfs_strtoul_clamp(file, var, min, max)                      \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              return strtoul_safe_clamp(buf, var, min, max)           \
 +                      ?: (ssize_t) size;                              \
 +} while (0)
 +
 +#define strtoul_or_return(cp)                                         \
 +({                                                                    \
 +      unsigned long _v;                                               \
 +      int _r = kstrtoul(cp, 10, &_v);                                 \
 +      if (_r)                                                         \
 +              return _r;                                              \
 +      _v;                                                             \
 +})
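 +
 +/*
 + * Note that strtoul_or_return() is a statement expression: on a parse error
 + * it returns from the enclosing store function rather than evaluating to an
 + * error code.
 + */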
 +
 +write_attribute(trigger_gc);
 +write_attribute(trigger_discards);
 +write_attribute(trigger_invalidates);
 +write_attribute(prune_cache);
 +write_attribute(btree_wakeup);
 +rw_attribute(btree_gc_periodic);
 +rw_attribute(gc_gens_pos);
 +
 +read_attribute(uuid);
 +read_attribute(minor);
 +read_attribute(bucket_size);
 +read_attribute(first_bucket);
 +read_attribute(nbuckets);
 +rw_attribute(durability);
 +read_attribute(iodone);
 +
 +read_attribute(io_latency_read);
 +read_attribute(io_latency_write);
 +read_attribute(io_latency_stats_read);
 +read_attribute(io_latency_stats_write);
 +read_attribute(congested);
 +
 +read_attribute(btree_write_stats);
 +
 +read_attribute(btree_cache_size);
 +read_attribute(compression_stats);
 +read_attribute(journal_debug);
 +read_attribute(btree_updates);
 +read_attribute(btree_cache);
 +read_attribute(btree_key_cache);
 +read_attribute(stripes_heap);
 +read_attribute(open_buckets);
 +read_attribute(open_buckets_partial);
 +read_attribute(write_points);
 +read_attribute(nocow_lock_table);
 +
 +#ifdef BCH_WRITE_REF_DEBUG
 +read_attribute(write_refs);
 +
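 +/*
 + * One human-readable name per entry of c->writes[], generated from the
 + * BCH_WRITE_REFS() x-macro:
 + */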
 +static const char * const bch2_write_refs[] = {
 +#define x(n)  #n,
 +      BCH_WRITE_REFS()
 +#undef x
 +      NULL
 +};
 +
 +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
 +{
 +      bch2_printbuf_tabstop_push(out, 24);
 +
 +      for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
 +              prt_str(out, bch2_write_refs[i]);
 +              prt_tab(out);
 +              prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
 +              prt_newline(out);
 +      }
 +}
 +#endif
 +
 +read_attribute(internal_uuid);
 +read_attribute(disk_groups);
 +
 +read_attribute(has_data);
 +read_attribute(alloc_debug);
 +
 +#define x(t, n, ...) read_attribute(t);
 +BCH_PERSISTENT_COUNTERS()
 +#undef x
 +
 +rw_attribute(discard);
 +rw_attribute(label);
 +
 +rw_attribute(copy_gc_enabled);
 +read_attribute(copy_gc_wait);
 +
 +rw_attribute(rebalance_enabled);
 +sysfs_pd_controller_attribute(rebalance);
 +read_attribute(rebalance_work);
 +rw_attribute(promote_whole_extents);
 +
 +read_attribute(new_stripes);
 +
 +read_attribute(io_timers_read);
 +read_attribute(io_timers_write);
 +
 +read_attribute(moving_ctxts);
 +
 +#ifdef CONFIG_BCACHEFS_TESTS
 +write_attribute(perf_test);
 +#endif /* CONFIG_BCACHEFS_TESTS */
 +
 +#define x(_name)                                              \
 +      static struct attribute sysfs_time_stat_##_name =               \
 +              { .name = #_name, .mode = 0444 };
 +      BCH_TIME_STATS()
 +#undef x
 +
 +static struct attribute sysfs_state_rw = {
 +      .name = "state",
 +      .mode =  0444,
 +};
 +
 +static size_t bch2_btree_cache_size(struct bch_fs *c)
 +{
 +      size_t ret = 0;
 +      struct btree *b;
 +
 +      mutex_lock(&c->btree_cache.lock);
 +      list_for_each_entry(b, &c->btree_cache.live, list)
 +              ret += btree_bytes(c);
 +
 +      mutex_unlock(&c->btree_cache.lock);
 +      return ret;
 +}
 +
 +static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 +{
 +      struct btree_trans *trans;
 +      struct btree_iter iter;
 +      struct bkey_s_c k;
 +      enum btree_id id;
 +      u64 nr_uncompressed_extents = 0,
 +          nr_compressed_extents = 0,
 +          nr_incompressible_extents = 0,
 +          uncompressed_sectors = 0,
 +          incompressible_sectors = 0,
 +          compressed_sectors_compressed = 0,
 +          compressed_sectors_uncompressed = 0;
 +      int ret = 0;
 +
 +      if (!test_bit(BCH_FS_STARTED, &c->flags))
 +              return -EPERM;
 +
 +      trans = bch2_trans_get(c);
 +
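 +      /*
 +       * Walk every extent-bearing btree: pointers with no compression (or
 +       * marked incompressible) are counted by key size, while compressed
 +       * pointers are counted by both their on-disk (compressed_size) and
 +       * logical (uncompressed_size) sectors.
 +       */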
 +      for (id = 0; id < BTREE_ID_NR; id++) {
 +              if (!btree_type_has_ptrs(id))
 +                      continue;
 +
 +              for_each_btree_key(trans, iter, id, POS_MIN,
 +                                 BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
 +                      struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 +                      const union bch_extent_entry *entry;
 +                      struct extent_ptr_decoded p;
 +                      bool compressed = false, uncompressed = false, incompressible = false;
 +
 +                      bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 +                              switch (p.crc.compression_type) {
 +                              case BCH_COMPRESSION_TYPE_none:
 +                                      uncompressed = true;
 +                                      uncompressed_sectors += k.k->size;
 +                                      break;
 +                              case BCH_COMPRESSION_TYPE_incompressible:
 +                                      incompressible = true;
 +                                      incompressible_sectors += k.k->size;
 +                                      break;
 +                              default:
 +                                      compressed_sectors_compressed +=
 +                                              p.crc.compressed_size;
 +                                      compressed_sectors_uncompressed +=
 +                                              p.crc.uncompressed_size;
 +                                      compressed = true;
 +                                      break;
 +                              }
 +                      }
 +
 +                      if (incompressible)
 +                              nr_incompressible_extents++;
 +                      else if (uncompressed)
 +                              nr_uncompressed_extents++;
 +                      else if (compressed)
 +                              nr_compressed_extents++;
 +              }
 +              bch2_trans_iter_exit(trans, &iter);
 +      }
 +
 +      bch2_trans_put(trans);
 +
 +      if (ret)
 +              return ret;
 +
 +      prt_printf(out, "uncompressed:\n");
 +      prt_printf(out, "       nr extents:             %llu\n", nr_uncompressed_extents);
 +      prt_printf(out, "       size:                   ");
 +      prt_human_readable_u64(out, uncompressed_sectors << 9);
 +      prt_printf(out, "\n");
 +
 +      prt_printf(out, "compressed:\n");
 +      prt_printf(out, "       nr extents:             %llu\n", nr_compressed_extents);
 +      prt_printf(out, "       compressed size:        ");
 +      prt_human_readable_u64(out, compressed_sectors_compressed << 9);
 +      prt_printf(out, "\n");
 +      prt_printf(out, "       uncompressed size:      ");
 +      prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
 +      prt_printf(out, "\n");
 +
 +      prt_printf(out, "incompressible:\n");
 +      prt_printf(out, "       nr extents:             %llu\n", nr_incompressible_extents);
 +      prt_printf(out, "       size:                   ");
 +      prt_human_readable_u64(out, incompressible_sectors << 9);
 +      prt_printf(out, "\n");
 +      return 0;
 +}
 +
 +static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
 +{
 +      prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
 +      bch2_bpos_to_text(out, c->gc_gens_pos);
 +      prt_printf(out, "\n");
 +}
 +
 +static void bch2_btree_wakeup_all(struct bch_fs *c)
 +{
 +      struct btree_trans *trans;
 +
 +      seqmutex_lock(&c->btree_trans_lock);
 +      list_for_each_entry(trans, &c->btree_trans_list, list) {
 +              struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
 +
 +              if (b)
 +                      six_lock_wakeup_all(&b->lock);
 +
 +      }
 +      seqmutex_unlock(&c->btree_trans_lock);
 +}
 +
 +SHOW(bch2_fs)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 +
 +      sysfs_print(minor,                      c->minor);
 +      sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
 +
 +      sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
 +
 +      if (attr == &sysfs_btree_write_stats)
 +              bch2_btree_write_stats_to_text(out, c);
 +
 +      sysfs_printf(btree_gc_periodic, "%u",   (int) c->btree_gc_periodic);
 +
 +      if (attr == &sysfs_gc_gens_pos)
 +              bch2_gc_gens_pos_to_text(out, c);
 +
 +      sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
 +
 +      sysfs_printf(rebalance_enabled,         "%i", c->rebalance.enabled);
 +      sysfs_pd_controller_show(rebalance,     &c->rebalance.pd); /* XXX */
 +
 +      if (attr == &sysfs_copy_gc_wait)
 +              bch2_copygc_wait_to_text(out, c);
 +
 +      if (attr == &sysfs_rebalance_work)
 +              bch2_rebalance_work_to_text(out, c);
 +
 +      sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 +
 +      /* Debugging: */
 +
 +      if (attr == &sysfs_journal_debug)
 +              bch2_journal_debug_to_text(out, &c->journal);
 +
 +      if (attr == &sysfs_btree_updates)
 +              bch2_btree_updates_to_text(out, c);
 +
 +      if (attr == &sysfs_btree_cache)
 +              bch2_btree_cache_to_text(out, c);
 +
 +      if (attr == &sysfs_btree_key_cache)
 +              bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
 +
 +      if (attr == &sysfs_stripes_heap)
 +              bch2_stripes_heap_to_text(out, c);
 +
 +      if (attr == &sysfs_open_buckets)
 +              bch2_open_buckets_to_text(out, c);
 +
 +      if (attr == &sysfs_open_buckets_partial)
 +              bch2_open_buckets_partial_to_text(out, c);
 +
 +      if (attr == &sysfs_write_points)
 +              bch2_write_points_to_text(out, c);
 +
 +      if (attr == &sysfs_compression_stats)
 +              bch2_compression_stats_to_text(out, c);
 +
 +      if (attr == &sysfs_new_stripes)
 +              bch2_new_stripes_to_text(out, c);
 +
 +      if (attr == &sysfs_io_timers_read)
 +              bch2_io_timers_to_text(out, &c->io_clock[READ]);
 +
 +      if (attr == &sysfs_io_timers_write)
 +              bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
 +
 +      if (attr == &sysfs_moving_ctxts)
 +              bch2_fs_moving_ctxts_to_text(out, c);
 +
 +#ifdef BCH_WRITE_REF_DEBUG
 +      if (attr == &sysfs_write_refs)
 +              bch2_write_refs_to_text(out, c);
 +#endif
 +
 +      if (attr == &sysfs_nocow_lock_table)
 +              bch2_nocow_locks_to_text(out, &c->nocow_locks);
 +
 +      if (attr == &sysfs_disk_groups)
 +              bch2_disk_groups_to_text(out, c);
 +
 +      return 0;
 +}
 +
 +STORE(bch2_fs)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 +
 +      if (attr == &sysfs_btree_gc_periodic) {
 +              ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
 +                      ?: (ssize_t) size;
 +
 +              wake_up_process(c->gc_thread);
 +              return ret;
 +      }
 +
 +      if (attr == &sysfs_copy_gc_enabled) {
 +              ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
 +                      ?: (ssize_t) size;
 +
 +              if (c->copygc_thread)
 +                      wake_up_process(c->copygc_thread);
 +              return ret;
 +      }
 +
 +      if (attr == &sysfs_rebalance_enabled) {
 +              ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
 +                      ?: (ssize_t) size;
 +
 +              rebalance_wakeup(c);
 +              return ret;
 +      }
 +
 +      sysfs_pd_controller_store(rebalance,    &c->rebalance.pd);
 +
 +      sysfs_strtoul(promote_whole_extents,    c->promote_whole_extents);
 +
 +      /* Debugging: */
 +
 +      if (!test_bit(BCH_FS_STARTED, &c->flags))
 +              return -EPERM;
 +
 +      /* Debugging: */
 +
 +      if (!test_bit(BCH_FS_RW, &c->flags))
 +              return -EROFS;
 +
 +      if (attr == &sysfs_prune_cache) {
 +              struct shrink_control sc;
 +
 +              sc.gfp_mask = GFP_KERNEL;
 +              sc.nr_to_scan = strtoul_or_return(buf);
++              c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
 +      }
 +
 +      if (attr == &sysfs_btree_wakeup)
 +              bch2_btree_wakeup_all(c);
 +
 +      if (attr == &sysfs_trigger_gc) {
 +              /*
 +               * Full gc is currently incompatible with btree key cache:
 +               */
 +#if 0
 +              down_read(&c->state_lock);
 +              bch2_gc(c, false, false);
 +              up_read(&c->state_lock);
 +#else
 +              bch2_gc_gens(c);
 +#endif
 +      }
 +
 +      if (attr == &sysfs_trigger_discards)
 +              bch2_do_discards(c);
 +
 +      if (attr == &sysfs_trigger_invalidates)
 +              bch2_do_invalidates(c);
 +
 +#ifdef CONFIG_BCACHEFS_TESTS
 +      if (attr == &sysfs_perf_test) {
 +              char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
 +              char *test              = strsep(&p, " \t\n");
 +              char *nr_str            = strsep(&p, " \t\n");
 +              char *threads_str       = strsep(&p, " \t\n");
 +              unsigned threads;
 +              u64 nr;
 +              int ret = -EINVAL;
 +
 +              if (threads_str &&
 +                  !(ret = kstrtouint(threads_str, 10, &threads)) &&
 +                  !(ret = bch2_strtoull_h(nr_str, &nr)))
 +                      ret = bch2_btree_perf_test(c, test, nr, threads);
 +              kfree(tmp);
 +
 +              if (ret)
 +                      size = ret;
 +      }
 +#endif
 +      return size;
 +}
 +SYSFS_OPS(bch2_fs);
 +
 +struct attribute *bch2_fs_files[] = {
 +      &sysfs_minor,
 +      &sysfs_btree_cache_size,
 +      &sysfs_btree_write_stats,
 +
 +      &sysfs_promote_whole_extents,
 +
 +      &sysfs_compression_stats,
 +
 +#ifdef CONFIG_BCACHEFS_TESTS
 +      &sysfs_perf_test,
 +#endif
 +      NULL
 +};
 +
 +/* counters dir */
 +
 +SHOW(bch2_fs_counters)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
 +      u64 counter = 0;
 +      u64 counter_since_mount = 0;
 +
 +      printbuf_tabstop_push(out, 32);
 +
 +      #define x(t, ...) \
 +              if (attr == &sysfs_##t) {                                       \
 +                      counter             = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
 +                      counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
 +                      prt_printf(out, "since mount:");                                \
 +                      prt_tab(out);                                           \
 +                      prt_human_readable_u64(out, counter_since_mount);       \
 +                      prt_newline(out);                                       \
 +                                                                              \
 +                      prt_printf(out, "since filesystem creation:");          \
 +                      prt_tab(out);                                           \
 +                      prt_human_readable_u64(out, counter);                   \
 +                      prt_newline(out);                                       \
 +              }
 +      BCH_PERSISTENT_COUNTERS()
 +      #undef x
 +      return 0;
 +}
 +
 +STORE(bch2_fs_counters) {
 +      return 0;
 +}
 +
 +SYSFS_OPS(bch2_fs_counters);
 +
 +struct attribute *bch2_fs_counters_files[] = {
 +#define x(t, ...) \
 +      &sysfs_##t,
 +      BCH_PERSISTENT_COUNTERS()
 +#undef x
 +      NULL
 +};
 +/* internal dir - just a wrapper */
 +
 +SHOW(bch2_fs_internal)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
 +
 +      return bch2_fs_to_text(out, &c->kobj, attr);
 +}
 +
 +STORE(bch2_fs_internal)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
 +
 +      return bch2_fs_store(&c->kobj, attr, buf, size);
 +}
 +SYSFS_OPS(bch2_fs_internal);
 +
 +struct attribute *bch2_fs_internal_files[] = {
 +      &sysfs_journal_debug,
 +      &sysfs_btree_updates,
 +      &sysfs_btree_cache,
 +      &sysfs_btree_key_cache,
 +      &sysfs_new_stripes,
 +      &sysfs_stripes_heap,
 +      &sysfs_open_buckets,
 +      &sysfs_open_buckets_partial,
 +      &sysfs_write_points,
 +#ifdef BCH_WRITE_REF_DEBUG
 +      &sysfs_write_refs,
 +#endif
 +      &sysfs_nocow_lock_table,
 +      &sysfs_io_timers_read,
 +      &sysfs_io_timers_write,
 +
 +      &sysfs_trigger_gc,
 +      &sysfs_trigger_discards,
 +      &sysfs_trigger_invalidates,
 +      &sysfs_prune_cache,
 +      &sysfs_btree_wakeup,
 +
 +      &sysfs_gc_gens_pos,
 +
 +      &sysfs_copy_gc_enabled,
 +      &sysfs_copy_gc_wait,
 +
 +      &sysfs_rebalance_enabled,
 +      &sysfs_rebalance_work,
 +      sysfs_pd_controller_files(rebalance),
 +
 +      &sysfs_moving_ctxts,
 +
 +      &sysfs_internal_uuid,
 +
 +      &sysfs_disk_groups,
 +      NULL
 +};
 +
 +/* options */
 +
 +SHOW(bch2_fs_opts_dir)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
 +      const struct bch_option *opt = container_of(attr, struct bch_option, attr);
 +      int id = opt - bch2_opt_table;
 +      u64 v = bch2_opt_get_by_id(&c->opts, id);
 +
 +      bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
 +      prt_char(out, '\n');
 +
 +      return 0;
 +}
 +
 +STORE(bch2_fs_opts_dir)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
 +      const struct bch_option *opt = container_of(attr, struct bch_option, attr);
 +      int ret, id = opt - bch2_opt_table;
 +      char *tmp;
 +      u64 v;
 +
 +      /*
 +       * We don't need to take c->writes for correctness, but it eliminates an
 +       * unsightly error message in the dmesg log when we're RO:
 +       */
 +      if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
 +              return -EROFS;
 +
 +      tmp = kstrdup(buf, GFP_KERNEL);
 +      if (!tmp) {
 +              ret = -ENOMEM;
 +              goto err;
 +      }
 +
 +      ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
 +      kfree(tmp);
 +
 +      if (ret < 0)
 +              goto err;
 +
 +      ret = bch2_opt_check_may_set(c, id, v);
 +      if (ret < 0)
 +              goto err;
 +
 +      bch2_opt_set_sb(c, opt, v);
 +      bch2_opt_set_by_id(&c->opts, id, v);
 +
 +      if ((id == Opt_background_target ||
 +           id == Opt_background_compression) && v) {
 +              bch2_rebalance_add_work(c, S64_MAX);
 +              rebalance_wakeup(c);
 +      }
 +
 +      ret = size;
 +err:
 +      bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
 +      return ret;
 +}
 +SYSFS_OPS(bch2_fs_opts_dir);
 +
 +struct attribute *bch2_fs_opts_dir_files[] = { NULL };
 +
 +int bch2_opts_create_sysfs_files(struct kobject *kobj)
 +{
 +      const struct bch_option *i;
 +      int ret;
 +
 +      for (i = bch2_opt_table;
 +           i < bch2_opt_table + bch2_opts_nr;
 +           i++) {
 +              if (!(i->flags & OPT_FS))
 +                      continue;
 +
 +              ret = sysfs_create_file(kobj, &i->attr);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      return 0;
 +}
 +
 +/* time stats */
 +
 +SHOW(bch2_fs_time_stats)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
 +
 +#define x(name)                                                               \
 +      if (attr == &sysfs_time_stat_##name)                            \
 +              bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
 +      BCH_TIME_STATS()
 +#undef x
 +
 +      return 0;
 +}
 +
 +STORE(bch2_fs_time_stats)
 +{
 +      return size;
 +}
 +SYSFS_OPS(bch2_fs_time_stats);
 +
 +struct attribute *bch2_fs_time_stats_files[] = {
 +#define x(name)                                               \
 +      &sysfs_time_stat_##name,
 +      BCH_TIME_STATS()
 +#undef x
 +      NULL
 +};
 +
 +static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 +{
 +      struct bch_fs *c = ca->fs;
 +      struct bch_dev_usage stats = bch2_dev_usage_read(ca);
 +      unsigned i, nr[BCH_DATA_NR];
 +
 +      memset(nr, 0, sizeof(nr));
 +
 +      for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
 +              nr[c->open_buckets[i].data_type]++;
 +
 +      printbuf_tabstop_push(out, 8);
 +      printbuf_tabstop_push(out, 16);
 +      printbuf_tabstop_push(out, 16);
 +      printbuf_tabstop_push(out, 16);
 +      printbuf_tabstop_push(out, 16);
 +
 +      prt_tab(out);
 +      prt_str(out, "buckets");
 +      prt_tab_rjust(out);
 +      prt_str(out, "sectors");
 +      prt_tab_rjust(out);
 +      prt_str(out, "fragmented");
 +      prt_tab_rjust(out);
 +      prt_newline(out);
 +
 +      for (i = 0; i < BCH_DATA_NR; i++) {
 +              prt_str(out, bch2_data_types[i]);
 +              prt_tab(out);
 +              prt_u64(out, stats.d[i].buckets);
 +              prt_tab_rjust(out);
 +              prt_u64(out, stats.d[i].sectors);
 +              prt_tab_rjust(out);
 +              prt_u64(out, stats.d[i].fragmented);
 +              prt_tab_rjust(out);
 +              prt_newline(out);
 +      }
 +
 +      prt_str(out, "ec");
 +      prt_tab(out);
 +      prt_u64(out, stats.buckets_ec);
 +      prt_tab_rjust(out);
 +      prt_newline(out);
 +
 +      prt_newline(out);
 +
 +      prt_printf(out, "reserves:");
 +      prt_newline(out);
 +      for (i = 0; i < BCH_WATERMARK_NR; i++) {
 +              prt_str(out, bch2_watermarks[i]);
 +              prt_tab(out);
 +              prt_u64(out, bch2_dev_buckets_reserved(ca, i));
 +              prt_tab_rjust(out);
 +              prt_newline(out);
 +      }
 +
 +      prt_newline(out);
 +
 +      printbuf_tabstops_reset(out);
 +      printbuf_tabstop_push(out, 24);
 +
 +      prt_str(out, "freelist_wait");
 +      prt_tab(out);
 +      prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
 +      prt_newline(out);
 +
 +      prt_str(out, "open buckets allocated");
 +      prt_tab(out);
 +      prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
 +      prt_newline(out);
 +
 +      prt_str(out, "open buckets this dev");
 +      prt_tab(out);
 +      prt_u64(out, ca->nr_open_buckets);
 +      prt_newline(out);
 +
 +      prt_str(out, "open buckets total");
 +      prt_tab(out);
 +      prt_u64(out, OPEN_BUCKETS_COUNT);
 +      prt_newline(out);
 +
 +      prt_str(out, "open_buckets_wait");
 +      prt_tab(out);
 +      prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
 +      prt_newline(out);
 +
 +      prt_str(out, "open_buckets_btree");
 +      prt_tab(out);
 +      prt_u64(out, nr[BCH_DATA_btree]);
 +      prt_newline(out);
 +
 +      prt_str(out, "open_buckets_user");
 +      prt_tab(out);
 +      prt_u64(out, nr[BCH_DATA_user]);
 +      prt_newline(out);
 +
 +      prt_str(out, "buckets_to_invalidate");
 +      prt_tab(out);
 +      prt_u64(out, should_invalidate_buckets(ca, stats));
 +      prt_newline(out);
 +
 +      prt_str(out, "btree reserve cache");
 +      prt_tab(out);
 +      prt_u64(out, c->btree_reserve_cache_nr);
 +      prt_newline(out);
 +}
 +
 +static const char * const bch2_rw[] = {
 +      "read",
 +      "write",
 +      NULL
 +};
 +
 +static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
 +{
 +      int rw, i;
 +
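 +      /* io_done counters are in 512-byte sectors; << 9 converts to bytes: */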
 +      for (rw = 0; rw < 2; rw++) {
 +              prt_printf(out, "%s:\n", bch2_rw[rw]);
 +
 +              for (i = 1; i < BCH_DATA_NR; i++)
 +                      prt_printf(out, "%-12s:%12llu\n",
 +                             bch2_data_types[i],
 +                             percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
 +      }
 +}
 +
 +SHOW(bch2_dev)
 +{
 +      struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
 +      struct bch_fs *c = ca->fs;
 +
 +      sysfs_printf(uuid,              "%pU\n", ca->uuid.b);
 +
 +      sysfs_print(bucket_size,        bucket_bytes(ca));
 +      sysfs_print(first_bucket,       ca->mi.first_bucket);
 +      sysfs_print(nbuckets,           ca->mi.nbuckets);
 +      sysfs_print(durability,         ca->mi.durability);
 +      sysfs_print(discard,            ca->mi.discard);
 +
 +      if (attr == &sysfs_label) {
 +              if (ca->mi.group) {
 +                      mutex_lock(&c->sb_lock);
 +                      bch2_disk_path_to_text(out, c->disk_sb.sb,
 +                                             ca->mi.group - 1);
 +                      mutex_unlock(&c->sb_lock);
 +              }
 +
 +              prt_char(out, '\n');
 +      }
 +
 +      if (attr == &sysfs_has_data) {
 +              prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
 +              prt_char(out, '\n');
 +      }
 +
 +      if (attr == &sysfs_state_rw) {
 +              prt_string_option(out, bch2_member_states, ca->mi.state);
 +              prt_char(out, '\n');
 +      }
 +
 +      if (attr == &sysfs_iodone)
 +              dev_iodone_to_text(out, ca);
 +
 +      sysfs_print(io_latency_read,            atomic64_read(&ca->cur_latency[READ]));
 +      sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
 +
 +      if (attr == &sysfs_io_latency_stats_read)
 +              bch2_time_stats_to_text(out, &ca->io_latency[READ]);
 +
 +      if (attr == &sysfs_io_latency_stats_write)
 +              bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
 +
 +      sysfs_printf(congested,                 "%u%%",
 +                   clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
 +                   * 100 / CONGESTED_MAX);
 +
 +      if (attr == &sysfs_alloc_debug)
 +              dev_alloc_debug_to_text(out, ca);
 +
 +      return 0;
 +}
 +
 +STORE(bch2_dev)
 +{
 +      struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
 +      struct bch_fs *c = ca->fs;
 +      struct bch_member *mi;
 +
 +      if (attr == &sysfs_discard) {
 +              bool v = strtoul_or_return(buf);
 +
 +              mutex_lock(&c->sb_lock);
 +              mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 +
 +              if (v != BCH_MEMBER_DISCARD(mi)) {
 +                      SET_BCH_MEMBER_DISCARD(mi, v);
 +                      bch2_write_super(c);
 +              }
 +              mutex_unlock(&c->sb_lock);
 +      }
 +
 +      if (attr == &sysfs_durability) {
 +              u64 v = strtoul_or_return(buf);
 +
 +              mutex_lock(&c->sb_lock);
 +              mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 +
 +              if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
 +                      SET_BCH_MEMBER_DURABILITY(mi, v + 1);
 +                      bch2_write_super(c);
 +              }
 +              mutex_unlock(&c->sb_lock);
 +      }
 +
 +      if (attr == &sysfs_label) {
 +              char *tmp;
 +              int ret;
 +
 +              tmp = kstrdup(buf, GFP_KERNEL);
 +              if (!tmp)
 +                      return -ENOMEM;
 +
 +              ret = bch2_dev_group_set(c, ca, strim(tmp));
 +              kfree(tmp);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      return size;
 +}
 +SYSFS_OPS(bch2_dev);
 +
 +struct attribute *bch2_dev_files[] = {
 +      &sysfs_uuid,
 +      &sysfs_bucket_size,
 +      &sysfs_first_bucket,
 +      &sysfs_nbuckets,
 +      &sysfs_durability,
 +
 +      /* settings: */
 +      &sysfs_discard,
 +      &sysfs_state_rw,
 +      &sysfs_label,
 +
 +      &sysfs_has_data,
 +      &sysfs_iodone,
 +
 +      &sysfs_io_latency_read,
 +      &sysfs_io_latency_write,
 +      &sysfs_io_latency_stats_read,
 +      &sysfs_io_latency_stats_write,
 +      &sysfs_congested,
 +
 +      /* debug: */
 +      &sysfs_alloc_debug,
 +      NULL
 +};
 +
 +#endif /* NO_BCACHEFS_SYSFS */
Simple merge
Simple merge
diff --cc fs/ext4/ext4.h
Simple merge
Simple merge
diff --cc fs/ext4/inode.c
Simple merge
diff --cc fs/ext4/super.c
Simple merge
diff --cc fs/f2fs/super.c
Simple merge
diff --cc fs/gfs2/bmap.c
Simple merge
diff --cc fs/gfs2/glock.c
Simple merge
diff --cc fs/gfs2/quota.c
Simple merge
Simple merge
Simple merge
diff --cc fs/nfs/super.c
Simple merge
Simple merge
Simple merge
diff --cc fs/ntfs3/file.c
Simple merge
diff --cc fs/ocfs2/aops.c
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/super.c
Simple merge
Simple merge
diff --cc fs/ufs/inode.c
Simple merge
index 003e157241da1e39a888f351f70d33b94f25e8b2,9e7ba04572db0526db853e4922605077fbefa129..545c7991b9b584cb576d33d55cdd67826e0fa753
@@@ -1945,7 -1943,9 +1943,7 @@@ voi
  xfs_free_buftarg(
        struct xfs_buftarg      *btp)
  {
-       unregister_shrinker(&btp->bt_shrinker);
 -      struct block_device     *bdev = btp->bt_bdev;
 -
+       shrinker_free(btp->bt_shrinker);
        ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
        percpu_counter_destroy(&btp->bt_io_count);
        list_lru_destroy(&btp->bt_lru);
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc kernel/exit.c
Simple merge
diff --cc kernel/fork.c
Simple merge
Simple merge
Simple merge
diff --cc mm/mempolicy.c
Simple merge
diff --cc mm/mmap.c
Simple merge
diff --cc mm/nommu.c
Simple merge
diff --cc mm/percpu.c
Simple merge
diff --cc mm/shmem.c
index 6b102965d355f6865693c7ea91a5f6386970388b,a314a25aea8cceea80b920eebd1d89734941f333..71b8d957b63bec8384feb8c369289afcd95d65b4
@@@ -3339,11 -3349,11 +3349,11 @@@ static int shmem_unlink(struct inode *d
        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  
        dir->i_size -= BOGO_DIRENT_SIZE;
 -      dir->i_mtime = inode_set_ctime_to_ts(dir,
 -                                           inode_set_ctime_current(inode));
 +      inode_set_mtime_to_ts(dir,
 +                            inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);
        drop_nlink(inode);
-       dput(dentry);   /* Undo the count from "create" - this does all the work */
+       dput(dentry);   /* Undo the count from "create" - does all the work */
        return 0;
  }
  
diff --cc mm/util.c
Simple merge
Simple merge
index 9429d361059e0a07cf0d34c872c99903bdb347be,1c61e3c022cb84e3b6f1b49bafd70af12fa5239d..3c9bf0cd82a80dfe4189e273efbd9693f4f61b22
@@@ -115,210 -114,97 +115,223 @@@ static bool test_clone3(uint64_t flags
        ret = call_clone3(flags, size, test_mode);
        ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
                        getpid(), ret, expected);
 -      if (ret != expected)
 -              ksft_test_result_fail(
 +      if (ret != expected) {
 +              ksft_print_msg(
                        "[%d] Result (%d) is different than expected (%d)\n",
                        getpid(), ret, expected);
 -      else
 -              ksft_test_result_pass(
 -                      "[%d] Result (%d) matches expectation (%d)\n",
 -                      getpid(), ret, expected);
 -}
 -
 -int main(int argc, char *argv[])
 -{
 -      uid_t uid = getuid();
 -
 -      ksft_print_header();
 -      ksft_set_plan(19);
 -      test_clone3_supported();
 -
 -      /* Just a simple clone3() should return 0.*/
 -      test_clone3(0, 0, 0, CLONE3_ARGS_NO_TEST);
 -
 -      /* Do a clone3() in a new PID NS.*/
 -      if (uid == 0)
 -              test_clone3(CLONE_NEWPID, 0, 0, CLONE3_ARGS_NO_TEST);
 -      else
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
 +              return false;
 +      }
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0. */
 -      test_clone3(0, CLONE_ARGS_SIZE_VER0, 0, CLONE3_ARGS_NO_TEST);
 +      return true;
 +}
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 */
 -      test_clone3(0, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST);
 +typedef bool (*filter_function)(void);
 +typedef size_t (*size_function)(void);
  
 -      /* Do a clone3() with sizeof(struct clone_args) + 8 */
 -      test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_NO_TEST);
 +static bool not_root(void)
 +{
 +      if (getuid() != 0) {
 +              ksft_print_msg("Not running as root\n");
 +              return true;
 +      }
  
 -      /* Do a clone3() with exit_signal having highest 32 bits non-zero */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG);
 +      return false;
 +}
  
 -      /* Do a clone3() with negative 32-bit exit_signal */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG);
++static bool no_timenamespace(void)
++{
++      if (not_root())
++              return true;
 -      /* Do a clone3() with exit_signal not fitting into CSIGNAL mask */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG);
++      if (!access("/proc/self/ns/time", F_OK))
++              return false;
 -      /* Do a clone3() with NSIG < exit_signal < CSIG */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG);
++      ksft_print_msg("Time namespaces are not supported\n");
++      return true;
++}
 -      test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_ALL_0);
 +static size_t page_size_plus_8(void)
 +{
 +      return getpagesize() + 8;
 +}
  
 -      test_clone3(0, sizeof(struct __clone_args) + 16, -E2BIG,
 -                      CLONE3_ARGS_ALL_0);
 +struct test {
 +      const char *name;
 +      uint64_t flags;
 +      size_t size;
 +      size_function size_function;
 +      int expected;
 +      enum test_mode test_mode;
 +      filter_function filter;
 +};
  
 -      test_clone3(0, sizeof(struct __clone_args) * 2, -E2BIG,
 -                      CLONE3_ARGS_ALL_0);
 +static const struct test tests[] = {
 +      {
 +              .name = "simple clone3()",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "clone3() in a new PID_NS",
 +              .flags = CLONE_NEWPID,
 +              .size = 0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +              .filter = not_root,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0",
 +              .flags = 0,
 +              .size = CLONE_ARGS_SIZE_VER0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0 - 8",
 +              .flags = 0,
 +              .size = CLONE_ARGS_SIZE_VER0 - 8,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "sizeof(struct clone_args) + 8",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 8,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "exit_signal with highest 32 bits non-zero",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG,
 +      },
 +      {
 +              .name = "negative 32-bit exit_signal",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
 +      },
 +      {
 +              .name = "exit_signal not fitting into CSIGNAL mask",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
 +      },
 +      {
 +              .name = "NSIG < exit_signal < CSIG",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
 +      },
 +      {
 +              .name = "Arguments sizeof(struct clone_args) + 8",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 8,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_ALL_0,
 +      },
 +      {
 +              .name = "Arguments sizeof(struct clone_args) + 16",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 16,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_ALL_0,
 +      },
 +      {
 +              .name = "Arguments sizeof(struct clone_arg) * 2",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 16,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_ALL_0,
 +      },
 +      {
 +              .name = "Arguments > page size",
 +              .flags = 0,
 +              .size_function = page_size_plus_8,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0 in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size = CLONE_ARGS_SIZE_VER0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +              .filter = not_root,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size = CLONE_ARGS_SIZE_VER0 - 8,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "sizeof(struct clone_args) + 8 in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size = sizeof(struct __clone_args) + 8,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +              .filter = not_root,
 +      },
 +      {
 +              .name = "Arguments > page size in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size_function = page_size_plus_8,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "New time NS",
 +              .flags = CLONE_NEWTIME,
 +              .size = 0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
++              .filter = no_timenamespace,
 +      },
 +      {
 +              .name = "exit signal (SIGCHLD) in flags",
 +              .flags = SIGCHLD,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +};
  
 -      /* Do a clone3() with > page size */
 -      test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST);
 +int main(int argc, char *argv[])
 +{
 +      size_t size;
 +      int i;
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0 in a new PID NS. */
 -      if (uid == 0)
 -              test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0, 0,
 -                              CLONE3_ARGS_NO_TEST);
 -      else
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
 +      ksft_print_header();
 +      ksft_set_plan(ARRAY_SIZE(tests));
 +      test_clone3_supported();
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS */
 -      test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL,
 -                      CLONE3_ARGS_NO_TEST);
 +      for (i = 0; i < ARRAY_SIZE(tests); i++) {
 +              if (tests[i].filter && tests[i].filter()) {
 +                      ksft_test_result_skip("%s\n", tests[i].name);
 +                      continue;
 +              }
  
 -      /* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */
 -      if (uid == 0)
 -              test_clone3(CLONE_NEWPID, sizeof(struct __clone_args) + 8, 0,
 -                              CLONE3_ARGS_NO_TEST);
 -      else
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
 +              if (tests[i].size_function)
 +                      size = tests[i].size_function();
 +              else
 +                      size = tests[i].size;
  
 -      /* Do a clone3() with > page size in a new PID NS */
 -      test_clone3(CLONE_NEWPID, getpagesize() + 8, -E2BIG,
 -                      CLONE3_ARGS_NO_TEST);
 +              ksft_print_msg("Running test '%s'\n", tests[i].name);
  
 -      /* Do a clone3() in a new time namespace */
 -      if (access("/proc/self/ns/time", F_OK) == 0) {
 -              test_clone3(CLONE_NEWTIME, 0, 0, CLONE3_ARGS_NO_TEST);
 -      } else {
 -              ksft_print_msg("Time namespaces are not supported\n");
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWTIME\n");
 +              ksft_test_result(test_clone3(tests[i].flags, size,
 +                                           tests[i].expected,
 +                                           tests[i].test_mode),
 +                               "%s\n", tests[i].name);
        }
  
 -      /* Do a clone3() with exit signal (SIGCHLD) in flags */
 -      test_clone3(SIGCHLD, 0, -EINVAL, CLONE3_ARGS_NO_TEST);
 -
        ksft_finished();
  }
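
For readers unfamiliar with the table-driven pattern the converted clone3.c above adopts, the following standalone sketch mirrors the same idea outside the kselftest harness: an array of test descriptors, optional per-test filter callbacks that decide whether a test should be skipped, and a single loop that runs and reports each entry. The demo_* names, the trivial demo_op() function and the plain printf() TAP-style reporting are illustrative stand-ins only; they are not part of the kernel selftest API or of this commit.

/*
 * Minimal sketch of a table-driven test runner, assuming made-up demo_*
 * names; printf() stands in for the ksft_* reporting helpers used above.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

typedef bool (*demo_filter_fn)(void);	/* return true to skip the test */

struct demo_test {
	const char *name;
	int input;
	int expected;
	demo_filter_fn filter;
};

/* Example filter: skip tests that need root, mirroring not_root() above. */
static bool demo_not_root(void)
{
	if (getuid() != 0) {
		printf("# not running as root\n");
		return true;
	}
	return false;
}

/* Trivial function under test, standing in for test_clone3(). */
static int demo_op(int input)
{
	return input * 2;
}

static const struct demo_test demo_tests[] = {
	{ .name = "doubles zero",	.input = 0, .expected = 0 },
	{ .name = "doubles two",	.input = 2, .expected = 4 },
	{ .name = "needs root",		.input = 3, .expected = 6,
	  .filter = demo_not_root },
};

int main(void)
{
	size_t i, ntests = sizeof(demo_tests) / sizeof(demo_tests[0]);

	/* Plan line derives from the table, like ksft_set_plan(ARRAY_SIZE(tests)). */
	printf("1..%zu\n", ntests);

	for (i = 0; i < ntests; i++) {
		const struct demo_test *t = &demo_tests[i];

		/* A filter returning true means "skip", as in the loop above. */
		if (t->filter && t->filter()) {
			printf("ok %zu # SKIP %s\n", i + 1, t->name);
			continue;
		}

		printf("%s %zu - %s\n",
		       demo_op(t->input) == t->expected ? "ok" : "not ok",
		       i + 1, t->name);
	}

	return 0;
}

The conversion in the diff follows the same shape: each former test_clone3() call becomes one table entry, the ad hoc uid and /proc/self/ns/time checks become reusable filter callbacks, and the test plan is taken from ARRAY_SIZE(tests) instead of the previously hand-maintained count of 19.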
index 60a9a305aef071ee7aad5210fe456701d240c28a,56f0230a8b92d37a4eb499ba69062f16393b02d4..56f0230a8b92d37a4eb499ba69062f16393b02d4
mode 100755,100644..100755