Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 3 Nov 2023 05:38:47 +0000 (19:38 -1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 3 Nov 2023 05:38:47 +0000 (19:38 -1000)
Pull MM updates from Andrew Morton:
 "Many singleton patches against the MM code. The patch series which are
  included in this merge do the following:

   - Kemeng Shi has contributed some compaction maintenance work in the
     series 'Fixes and cleanups to compaction'

   - Joel Fernandes has a patchset ('Optimize mremap during mutual
     alignment within PMD') which fixes an obscure issue with mremap()'s
     pagetable handling during a subsequent exec(), based upon an
     implementation which Linus suggested

   - More DAMON/DAMOS maintenance and feature work from SeongJae Park in
     the following patch series:

        mm/damon: misc fixups for documents, comments and its tracepoint
        mm/damon: add a tracepoint for damos apply target regions
        mm/damon: provide pseudo-moving sum based access rate
        mm/damon: implement DAMOS apply intervals
        mm/damon/core-test: Fix memory leaks in core-test
        mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval

   - In the series 'Do not try to access unaccepted memory' Adrian
     Hunter provides some fixups for the recently-added 'unaccepted
     memory' feature, to increase the feature's checking coverage: 'Plug
     a few gaps where RAM is exposed without checking if it is
     unaccepted memory'

   - In the series 'cleanups for lockless slab shrink' Qi Zheng has done
     some maintenance work which is preparation for the lockless slab
     shrinking code

   - Qi Zheng has redone the earlier (and reverted) attempt to make slab
     shrinking lockless in the series 'use refcount+RCU method to
     implement lockless slab shrink'

   - David Hildenbrand contributes some maintenance work for the rmap
     code in the series 'Anon rmap cleanups'

   - Kefeng Wang does more folio conversions and some maintenance work
     in the migration code. Series 'mm: migrate: more folio conversion
     and unification'

   - Matthew Wilcox has fixed an issue in the buffer_head code which was
     causing long stalls under some heavy memory/IO loads. Some cleanups
     were added on the way. Series 'Add and use bdev_getblk()'

   - In the series 'Use nth_page() in place of direct struct page
     manipulation' Zi Yan has fixed a potential issue with the direct
     manipulation of hugetlb page frames

   - The series 'mm: hugetlb: Skip initialization of gigantic tail
     struct pages if freed by HVO' has improved our handling of gigantic
     pages in the hugetlb vmemmap optimization code. This provides
     significant boot time improvements when large numbers of gigantic
     pages are in use

   - Matthew Wilcox has sent the series 'Small hugetlb cleanups' - code
     rationalization and folio conversions in the hugetlb code

   - Yin Fengwei has improved mlock()'s handling of large folios in the
     series 'support large folio for mlock'

   - In the series 'Expose swapcache stat for memcg v1' Liu Shixin has
     added statistics for memcg v1 users which are available (and
     useful) under memcg v2

   - Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
     prctl so that userspace may direct the kernel to not automatically
     propagate the denial to child processes. The series is named 'MDWE
     without inheritance'
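
     (Not part of the merge: a minimal userspace sketch of how the new
     flag is expected to be used. The fallback #define values below are
     assumptions for uapi headers that predate this series.)

        #include <stdio.h>
        #include <sys/prctl.h>

        #ifndef PR_SET_MDWE
        #define PR_SET_MDWE                     65
        #endif
        #ifndef PR_MDWE_REFUSE_EXEC_GAIN
        #define PR_MDWE_REFUSE_EXEC_GAIN        (1UL << 0)
        #endif
        #ifndef PR_MDWE_NO_INHERIT              /* the bit added by this series */
        #define PR_MDWE_NO_INHERIT              (1UL << 1)
        #endif

        int main(void)
        {
                /* Deny write+execute memory in this process, but do not
                 * propagate the restriction to child processes. */
                if (prctl(PR_SET_MDWE,
                          PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
                          0, 0, 0))
                        perror("PR_SET_MDWE");
                return 0;
        }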

   - Kefeng Wang has provided the series 'mm: convert numa balancing
     functions to use a folio' which does what it says

   - In the series 'mm/ksm: add fork-exec support for prctl' Stefan
     Roesch makes it possible for a process to propagate KSM treatment
     across exec()
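
     (Not part of the merge: a hedged sketch of the prctl() involved.
     PR_SET_MEMORY_MERGE already existed; what this series changes is
     that the opt-in now survives a subsequent fork()+exec(). The
     fallback value is an assumption for older <sys/prctl.h> headers.)

        #include <stdio.h>
        #include <unistd.h>
        #include <sys/prctl.h>

        #ifndef PR_SET_MEMORY_MERGE
        #define PR_SET_MEMORY_MERGE     67
        #endif

        int main(int argc, char **argv)
        {
                /* Opt the whole process into KSM merging. */
                if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
                        perror("PR_SET_MEMORY_MERGE");

                /* With this series the exec()ed program keeps the KSM
                 * opt-in instead of silently losing it. */
                if (argc > 1)
                        execv(argv[1], &argv[1]);
                return 0;
        }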

   - Huang Ying has enhanced memory tiering's calculation of memory
     distances. This is used to permit the dax/kmem driver to use 'high
     bandwidth memory' in addition to Optane Data Center Persistent
     Memory Modules (DCPMM). The series is named 'memory tiering:
     calculate abstract distance based on ACPI HMAT'

   - In the series 'Smart scanning mode for KSM' Stefan Roesch has
     optimized KSM by teaching it to retain and use some historical
     information from previous scans

   - Yosry Ahmed has fixed some inconsistencies in memcg statistics in
     the series 'mm: memcg: fix tracking of pending stats updates
     values'

   - In the series 'Implement IOCTL to get and optionally clear info
     about PTEs' Peter Xu has added an ioctl to /proc/<pid>/pagemap
     which permits us to atomically read-then-clear page softdirty
     state. This is mainly used by CRIU

   - Hugh Dickins contributed the series 'shmem,tmpfs: general
     maintenance', a bunch of relatively minor maintenance tweaks to
     this code

   - Matthew Wilcox has increased the use of the VMA lock over
     file-backed page faults in the series 'Handle more faults under the
     VMA lock'. Some rationalizations of the fault path became possible
     as a result

   - In the series 'mm/rmap: convert page_move_anon_rmap() to
     folio_move_anon_rmap()' David Hildenbrand has implemented some
     cleanups and folio conversions

   - In the series 'various improvements to the GUP interface' Lorenzo
     Stoakes has simplified and improved the GUP interface with an eye
     to providing groundwork for future improvements

   - Andrey Konovalov has sent along the series 'kasan: assorted fixes
     and improvements' which does those things

   - Some page allocator maintenance work from Kemeng Shi in the series
     'Two minor cleanups to break_down_buddy_pages'

   - In the series 'New selftest for mm' Breno Leitao has developed
     another MM self test which tickles a race we had between madvise()
     and page faults

   - In the series 'Add folio_end_read' Matthew Wilcox provides cleanups
     and an optimization to the core pagecache code

   - Nhat Pham has added memcg accounting for hugetlb memory in the
     series 'hugetlb memcg accounting'

   - Cleanups and rationalizations to the pagemap code from Lorenzo
     Stoakes, in the series 'Abstract vma_merge() and split_vma()'

   - Audra Mitchell has fixed issues in the procfs page_owner code's new
     timestamping feature which was causing some misbehaviours. In the
     series 'Fix page_owner's use of free timestamps'

   - Lorenzo Stoakes has fixed the handling of new mappings of sealed
     files in the series 'permit write-sealed memfd read-only shared
     mappings'
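
     (Not part of the merge: a hedged userspace sketch of the case this
     series addresses: mapping a write-sealed memfd read-only with
     MAP_SHARED, which previously could be refused outright.)

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = memfd_create("sealed", MFD_CLOEXEC | MFD_ALLOW_SEALING);

                if (fd < 0) {
                        perror("memfd_create");
                        return 1;
                }
                if (ftruncate(fd, 4096) || write(fd, "hello", 5) != 5)
                        return 1;

                /* Forbid any further writes through any fd. */
                if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE)) {
                        perror("F_ADD_SEALS");
                        return 1;
                }

                /* Read-only shared mapping of the write-sealed memfd:
                 * previously rejected, permitted after this series. */
                char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

                if (p == MAP_FAILED) {
                        perror("mmap");
                        return 1;
                }
                printf("%.5s\n", p);
                return 0;
        }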

   - Mike Kravetz has optimized the hugetlb vmemmap optimization in the
     series 'Batch hugetlb vmemmap modification operations'

   - Some buffer_head folio conversions and cleanups from Matthew Wilcox
     in the series 'Finish the create_empty_buffers() transition'

   - As a page allocator performance optimization Huang Ying has added
     automatic tuning to the allocator's per-cpu-pages feature, in the
     series 'mm: PCP high auto-tuning'

   - Roman Gushchin has contributed the patchset 'mm: improve
     performance of accounted kernel memory allocations' which improves
     their performance by ~30% as measured by a micro-benchmark

   - folio conversions from Kefeng Wang in the series 'mm: convert page
     cpupid functions to folios'

   - Some kmemleak fixups in Liu Shixin's series 'Some bugfix about
     kmemleak'

   - Qi Zheng has improved our handling of memoryless nodes by keeping
     them off the allocation fallback list. This is done in the series
     'handle memoryless nodes more appropriately'

   - khugepaged conversions from Vishal Moola in the series 'Some
     khugepaged folio conversions'"

[ bcachefs conflicts with the dynamically allocated shrinkers have been
  resolved as per Stephen Rothwell in

     https://lore.kernel.org/all/20230913093553.4290421e@canb.auug.org.au/

  with help from Qi Zheng.

  The clone3 test filtering conflict was half-arsed by yours truly ]

* tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (406 commits)
  mm/damon/sysfs: update monitoring target regions for online input commit
  mm/damon/sysfs: remove requested targets when online-commit inputs
  selftests: add a sanity check for zswap
  Documentation: maple_tree: fix word spelling error
  mm/vmalloc: fix the unchecked dereference warning in vread_iter()
  zswap: export compression failure stats
  Documentation: ubsan: drop "the" from article title
  mempolicy: migration attempt to match interleave nodes
  mempolicy: mmap_lock is not needed while migrating folios
  mempolicy: alloc_pages_mpol() for NUMA policy without vma
  mm: add page_rmappable_folio() wrapper
  mempolicy: remove confusing MPOL_MF_LAZY dead code
  mempolicy: mpol_shared_policy_init() without pseudo-vma
  mempolicy trivia: use pgoff_t in shared mempolicy tree
  mempolicy trivia: slightly more consistent naming
  mempolicy trivia: delete those ancient pr_debug()s
  mempolicy: fix migrate_pages(2) syscall return nr_failed
  kernfs: drop shared NUMA mempolicy hooks
  hugetlbfs: drop shared NUMA mempolicy pretence
  mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()
  ...

69 files changed:
Documentation/admin-guide/cgroup-v2.rst
MAINTAINERS
arch/arm64/kernel/mte.c
arch/x86/include/asm/bitops.h
arch/x86/kvm/mmu/mmu.c
drivers/acpi/acpi_pad.c
drivers/firmware/efi/unaccepted_memory.c
drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/msm/msm_drv.c
drivers/gpu/drm/msm/msm_drv.h
drivers/gpu/drm/panfrost/panfrost_device.h
drivers/gpu/drm/panfrost/panfrost_drv.c
drivers/gpu/drm/panfrost/panfrost_gem.h
drivers/md/bcache/bcache.h
drivers/md/dm-cache-metadata.c
drivers/md/raid5.c
drivers/virtio/virtio_balloon.c
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_types.h
fs/bcachefs/fs.c
fs/bcachefs/sysfs.c
fs/btrfs/super.c
fs/erofs/utils.c
fs/ext4/ext4.h
fs/ext4/extents_status.c
fs/ext4/inode.c
fs/ext4/super.c
fs/f2fs/super.c
fs/gfs2/bmap.c
fs/gfs2/glock.c
fs/gfs2/quota.c
fs/hugetlbfs/inode.c
fs/iomap/buffered-io.c
fs/nfs/super.c
fs/nfsd/filecache.c
fs/nfsd/nfs4state.c
fs/ntfs3/file.c
fs/ocfs2/aops.c
fs/proc/task_mmu.c
fs/quota/dquot.c
fs/reiserfs/inode.c
fs/super.c
fs/ubifs/super.c
fs/ufs/inode.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
include/linux/cgroup-defs.h
include/linux/fs.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/sched.h
include/linux/sched/numa_balancing.h
kernel/cgroup/cgroup.c
kernel/exit.c
kernel/fork.c
kernel/rcu/tree.c
kernel/sched/fair.c
mm/mempolicy.c
mm/mmap.c
mm/nommu.c
mm/percpu.c
mm/shmem.c
mm/util.c
net/sunrpc/auth.c
tools/testing/selftests/clone3/clone3.c
tools/testing/selftests/damon/sysfs.sh
tools/testing/selftests/mm/mremap_test.c

diff --cc MAINTAINERS
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 135278ddaf627bb1fd41ba6062a3596e52bd8f72,79ba576b22e3c9d0cf50f74f9e1197f95759b17a..3f2f7bf6e33526edeaa3a74288d0b14b79013aaf
@@@ -3,19 -3,12 +3,20 @@@
  #include <linux/efi.h>
  #include <linux/memblock.h>
  #include <linux/spinlock.h>
+ #include <linux/crash_dump.h>
  #include <asm/unaccepted_memory.h>
  
 -/* Protects unaccepted memory bitmap */
 +/* Protects unaccepted memory bitmap and accepting_list */
  static DEFINE_SPINLOCK(unaccepted_memory_lock);
  
 +struct accept_range {
 +      struct list_head list;
 +      unsigned long start;
 +      unsigned long end;
 +};
 +
 +static LIST_HEAD(accepting_list);
 +
  /*
   * accept_memory() -- Consult bitmap and accept the memory if needed.
   *
Simple merge
index 443bbc3ed75089110ef9f17f014e99be5209ff60,7f20249d60715b1676e885aa485c94f024fec7d8..2aae7d107f3356e08b55b6b05bf7cf96205318a0
@@@ -265,12 -457,23 +265,14 @@@ static int msm_drm_init(struct device *
        if (ret)
                goto err_deinit_vram;
  
-       msm_gem_shrinker_init(ddev);
 -      /* the fw fb could be anywhere in memory */
 -      ret = drm_aperture_remove_framebuffers(drv);
 -      if (ret)
 -              goto err_msm_uninit;
 -
+       ret = msm_gem_shrinker_init(ddev);
+       if (ret)
+               goto err_msm_uninit;
  
        if (priv->kms_init) {
 -              ret = priv->kms_init(ddev);
 -              if (ret) {
 -                      DRM_DEV_ERROR(dev, "failed to load kms\n");
 -                      priv->kms = NULL;
 +              ret = msm_drm_kms_init(dev, drv);
 +              if (ret)
                        goto err_msm_uninit;
 -              }
 -              kms = priv->kms;
        } else {
                /* valid only for the dummy headless case, where of_node=NULL */
                WARN_ON(dev->of_node);
Simple merge
index 1e85656dc2f7fe71e57d35ab64b672aa3ebd6883,e667e56893536709bea72867f135a5c1fdd735dd..1ef38f60d5dc4e96f2878d0e6b0ad4c16d1f382d
@@@ -119,14 -118,9 +119,14 @@@ struct panfrost_device 
  
        struct mutex shrinker_lock;
        struct list_head shrinker_list;
-       struct shrinker shrinker;
+       struct shrinker *shrinker;
  
        struct panfrost_devfreq pfdevfreq;
 +
 +      struct {
 +              atomic_t use_count;
 +              spinlock_t lock;
 +      } cycle_counter;
  };
  
  struct panfrost_mmu {
Simple merge
Simple merge
Simple merge
Simple merge
index 82cf243aa28830ef1720c994bd7261eebbb23cba,0000000000000000000000000000000000000000..5e585819190576db1f22ec9ec19b9b3322f1caf1
mode 100644,000000..100644
--- /dev/null
@@@ -1,1202 -1,0 +1,1204 @@@
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
 +// SPDX-License-Identifier: GPL-2.0
 +
 +#include "bcachefs.h"
 +#include "bkey_buf.h"
 +#include "btree_cache.h"
 +#include "btree_io.h"
 +#include "btree_iter.h"
 +#include "btree_locking.h"
 +#include "debug.h"
 +#include "errcode.h"
 +#include "error.h"
 +#include "trace.h"
 +
 +#include <linux/prefetch.h>
 +#include <linux/sched/mm.h>
 +
 +const char * const bch2_btree_node_flags[] = {
 +#define x(f)  #f,
 +      BTREE_FLAGS()
 +#undef x
 +      NULL
 +};
 +
 +void bch2_recalc_btree_reserve(struct bch_fs *c)
 +{
 +      unsigned i, reserve = 16;
 +
 +      if (!c->btree_roots_known[0].b)
 +              reserve += 8;
 +
 +      for (i = 0; i < btree_id_nr_alive(c); i++) {
 +              struct btree_root *r = bch2_btree_id_root(c, i);
 +
 +              if (r->b)
 +                      reserve += min_t(unsigned, 1, r->b->c.level) * 8;
 +      }
 +
 +      c->btree_cache.reserve = reserve;
 +}
 +
 +static inline unsigned btree_cache_can_free(struct btree_cache *bc)
 +{
 +      return max_t(int, 0, bc->used - bc->reserve);
 +}
 +
 +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
 +{
 +      if (b->c.lock.readers)
 +              list_move(&b->list, &bc->freed_pcpu);
 +      else
 +              list_move(&b->list, &bc->freed_nonpcpu);
 +}
 +
 +static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +
 +      EBUG_ON(btree_node_write_in_flight(b));
 +
 +      clear_btree_node_just_written(b);
 +
 +      kvpfree(b->data, btree_bytes(c));
 +      b->data = NULL;
 +#ifdef __KERNEL__
 +      kvfree(b->aux_data);
 +#else
 +      munmap(b->aux_data, btree_aux_data_bytes(b));
 +#endif
 +      b->aux_data = NULL;
 +
 +      bc->used--;
 +
 +      btree_node_to_freedlist(bc, b);
 +}
 +
 +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
 +                                 const void *obj)
 +{
 +      const struct btree *b = obj;
 +      const u64 *v = arg->key;
 +
 +      return b->hash_val == *v ? 0 : 1;
 +}
 +
 +static const struct rhashtable_params bch_btree_cache_params = {
 +      .head_offset    = offsetof(struct btree, hash),
 +      .key_offset     = offsetof(struct btree, hash_val),
 +      .key_len        = sizeof(u64),
 +      .obj_cmpfn      = bch2_btree_cache_cmp_fn,
 +};
 +
 +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 +{
 +      BUG_ON(b->data || b->aux_data);
 +
 +      b->data = kvpmalloc(btree_bytes(c), gfp);
 +      if (!b->data)
 +              return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 +#ifdef __KERNEL__
 +      b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
 +#else
 +      b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
 +                         PROT_READ|PROT_WRITE|PROT_EXEC,
 +                         MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
 +      if (b->aux_data == MAP_FAILED)
 +              b->aux_data = NULL;
 +#endif
 +      if (!b->aux_data) {
 +              kvpfree(b->data, btree_bytes(c));
 +              b->data = NULL;
 +              return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 +      }
 +
 +      return 0;
 +}
 +
 +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 +{
 +      struct btree *b;
 +
 +      b = kzalloc(sizeof(struct btree), gfp);
 +      if (!b)
 +              return NULL;
 +
 +      bkey_btree_ptr_init(&b->key);
 +      INIT_LIST_HEAD(&b->list);
 +      INIT_LIST_HEAD(&b->write_blocked);
 +      b->byte_order = ilog2(btree_bytes(c));
 +      return b;
 +}
 +
 +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      b = __btree_node_mem_alloc(c, GFP_KERNEL);
 +      if (!b)
 +              return NULL;
 +
 +      if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
 +              kfree(b);
 +              return NULL;
 +      }
 +
 +      bch2_btree_lock_init(&b->c, 0);
 +
 +      bc->used++;
 +      list_add(&b->list, &bc->freeable);
 +      return b;
 +}
 +
 +/* Btree in memory cache - hash table */
 +
 +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 +{
 +      int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
 +
 +      BUG_ON(ret);
 +
 +      /* Cause future lookups for this node to fail: */
 +      b->hash_val = 0;
 +}
 +
 +int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
 +{
 +      BUG_ON(b->hash_val);
 +      b->hash_val = btree_ptr_hash_val(&b->key);
 +
 +      return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
 +                                           bch_btree_cache_params);
 +}
 +
 +int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
 +                              unsigned level, enum btree_id id)
 +{
 +      int ret;
 +
 +      b->c.level      = level;
 +      b->c.btree_id   = id;
 +
 +      mutex_lock(&bc->lock);
 +      ret = __bch2_btree_node_hash_insert(bc, b);
 +      if (!ret)
 +              list_add_tail(&b->list, &bc->live);
 +      mutex_unlock(&bc->lock);
 +
 +      return ret;
 +}
 +
 +__flatten
 +static inline struct btree *btree_cache_find(struct btree_cache *bc,
 +                                   const struct bkey_i *k)
 +{
 +      u64 v = btree_ptr_hash_val(k);
 +
 +      return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
 +}
 +
 +/*
 + * this version is for btree nodes that have already been freed (we're not
 + * reaping a real btree node)
 + */
 +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      int ret = 0;
 +
 +      lockdep_assert_held(&bc->lock);
 +wait_on_io:
 +      if (b->flags & ((1U << BTREE_NODE_dirty)|
 +                      (1U << BTREE_NODE_read_in_flight)|
 +                      (1U << BTREE_NODE_write_in_flight))) {
 +              if (!flush)
 +                      return -BCH_ERR_ENOMEM_btree_node_reclaim;
 +
 +              /* XXX: waiting on IO with btree cache lock held */
 +              bch2_btree_node_wait_on_read(b);
 +              bch2_btree_node_wait_on_write(b);
 +      }
 +
 +      if (!six_trylock_intent(&b->c.lock))
 +              return -BCH_ERR_ENOMEM_btree_node_reclaim;
 +
 +      if (!six_trylock_write(&b->c.lock))
 +              goto out_unlock_intent;
 +
 +      /* recheck under lock */
 +      if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
 +                      (1U << BTREE_NODE_write_in_flight))) {
 +              if (!flush)
 +                      goto out_unlock;
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              goto wait_on_io;
 +      }
 +
 +      if (btree_node_noevict(b) ||
 +          btree_node_write_blocked(b) ||
 +          btree_node_will_make_reachable(b))
 +              goto out_unlock;
 +
 +      if (btree_node_dirty(b)) {
 +              if (!flush)
 +                      goto out_unlock;
 +              /*
 +               * Using the underscore version because we don't want to compact
 +               * bsets after the write, since this node is about to be evicted
 +               * - unless btree verify mode is enabled, since it runs out of
 +               * the post write cleanup:
 +               */
 +              if (bch2_verify_btree_ondisk)
 +                      bch2_btree_node_write(c, b, SIX_LOCK_intent,
 +                                            BTREE_WRITE_cache_reclaim);
 +              else
 +                      __bch2_btree_node_write(c, b,
 +                                              BTREE_WRITE_cache_reclaim);
 +
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              goto wait_on_io;
 +      }
 +out:
 +      if (b->hash_val && !ret)
 +              trace_and_count(c, btree_cache_reap, c, b);
 +      return ret;
 +out_unlock:
 +      six_unlock_write(&b->c.lock);
 +out_unlock_intent:
 +      six_unlock_intent(&b->c.lock);
 +      ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
 +      goto out;
 +}
 +
 +static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
 +{
 +      return __btree_node_reclaim(c, b, false);
 +}
 +
 +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 +{
 +      return __btree_node_reclaim(c, b, true);
 +}
 +
 +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 +                                         struct shrink_control *sc)
 +{
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b, *t;
 +      unsigned long nr = sc->nr_to_scan;
 +      unsigned long can_free = 0;
 +      unsigned long freed = 0;
 +      unsigned long touched = 0;
 +      unsigned i, flags;
 +      unsigned long ret = SHRINK_STOP;
 +      bool trigger_writes = atomic_read(&bc->dirty) + nr >=
 +              bc->used * 3 / 4;
 +
 +      if (bch2_btree_shrinker_disabled)
 +              return SHRINK_STOP;
 +
 +      mutex_lock(&bc->lock);
 +      flags = memalloc_nofs_save();
 +
 +      /*
 +       * It's _really_ critical that we don't free too many btree nodes - we
 +       * have to always leave ourselves a reserve. The reserve is how we
 +       * guarantee that allocating memory for a new btree node can always
 +       * succeed, so that inserting keys into the btree can always succeed and
 +       * IO can always make forward progress:
 +       */
 +      can_free = btree_cache_can_free(bc);
 +      nr = min_t(unsigned long, nr, can_free);
 +
 +      i = 0;
 +      list_for_each_entry_safe(b, t, &bc->freeable, list) {
 +              /*
 +               * Leave a few nodes on the freeable list, so that a btree split
 +               * won't have to hit the system allocator:
 +               */
 +              if (++i <= 3)
 +                      continue;
 +
 +              touched++;
 +
 +              if (touched >= nr)
 +                      goto out;
 +
 +              if (!btree_node_reclaim(c, b)) {
 +                      btree_node_data_free(c, b);
 +                      six_unlock_write(&b->c.lock);
 +                      six_unlock_intent(&b->c.lock);
 +                      freed++;
 +              }
 +      }
 +restart:
 +      list_for_each_entry_safe(b, t, &bc->live, list) {
 +              touched++;
 +
 +              if (btree_node_accessed(b)) {
 +                      clear_btree_node_accessed(b);
 +              } else if (!btree_node_reclaim(c, b)) {
 +                      freed++;
 +                      btree_node_data_free(c, b);
 +
 +                      bch2_btree_node_hash_remove(bc, b);
 +                      six_unlock_write(&b->c.lock);
 +                      six_unlock_intent(&b->c.lock);
 +
 +                      if (freed == nr)
 +                              goto out_rotate;
 +              } else if (trigger_writes &&
 +                         btree_node_dirty(b) &&
 +                         !btree_node_will_make_reachable(b) &&
 +                         !btree_node_write_blocked(b) &&
 +                         six_trylock_read(&b->c.lock)) {
 +                      list_move(&bc->live, &b->list);
 +                      mutex_unlock(&bc->lock);
 +                      __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
 +                      six_unlock_read(&b->c.lock);
 +                      if (touched >= nr)
 +                              goto out_nounlock;
 +                      mutex_lock(&bc->lock);
 +                      goto restart;
 +              }
 +
 +              if (touched >= nr)
 +                      break;
 +      }
 +out_rotate:
 +      if (&t->list != &bc->live)
 +              list_move_tail(&bc->live, &t->list);
 +out:
 +      mutex_unlock(&bc->lock);
 +out_nounlock:
 +      ret = freed;
 +      memalloc_nofs_restore(flags);
 +      trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
 +      return ret;
 +}
 +
 +static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 +                                          struct shrink_control *sc)
 +{
-       unregister_shrinker(&bc->shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_cache *bc = &c->btree_cache;
 +
 +      if (bch2_btree_shrinker_disabled)
 +              return 0;
 +
 +      return btree_cache_can_free(bc);
 +}
 +
 +void bch2_fs_btree_cache_exit(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      unsigned i, flags;
 +
-       bc->shrink.count_objects        = bch2_btree_cache_count;
-       bc->shrink.scan_objects         = bch2_btree_cache_scan;
-       bc->shrink.seeks                = 4;
-       ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
-       if (ret)
++      shrinker_free(bc->shrink);
 +
 +      /* vfree() can allocate memory: */
 +      flags = memalloc_nofs_save();
 +      mutex_lock(&bc->lock);
 +
 +      if (c->verify_data)
 +              list_move(&c->verify_data->list, &bc->live);
 +
 +      kvpfree(c->verify_ondisk, btree_bytes(c));
 +
 +      for (i = 0; i < btree_id_nr_alive(c); i++) {
 +              struct btree_root *r = bch2_btree_id_root(c, i);
 +
 +              if (r->b)
 +                      list_add(&r->b->list, &bc->live);
 +      }
 +
 +      list_splice(&bc->freeable, &bc->live);
 +
 +      while (!list_empty(&bc->live)) {
 +              b = list_first_entry(&bc->live, struct btree, list);
 +
 +              BUG_ON(btree_node_read_in_flight(b) ||
 +                     btree_node_write_in_flight(b));
 +
 +              if (btree_node_dirty(b))
 +                      bch2_btree_complete_write(c, b, btree_current_write(b));
 +              clear_btree_node_dirty_acct(c, b);
 +
 +              btree_node_data_free(c, b);
 +      }
 +
 +      BUG_ON(atomic_read(&c->btree_cache.dirty));
 +
 +      list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
 +
 +      while (!list_empty(&bc->freed_nonpcpu)) {
 +              b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
 +              list_del(&b->list);
 +              six_lock_exit(&b->c.lock);
 +              kfree(b);
 +      }
 +
 +      mutex_unlock(&bc->lock);
 +      memalloc_nofs_restore(flags);
 +
 +      if (bc->table_init_done)
 +              rhashtable_destroy(&bc->table);
 +}
 +
 +int bch2_fs_btree_cache_init(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
++      struct shrinker *shrink;
 +      unsigned i;
 +      int ret = 0;
 +
 +      ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
 +      if (ret)
 +              goto err;
 +
 +      bc->table_init_done = true;
 +
 +      bch2_recalc_btree_reserve(c);
 +
 +      for (i = 0; i < bc->reserve; i++)
 +              if (!__bch2_btree_node_mem_alloc(c))
 +                      goto err;
 +
 +      list_splice_init(&bc->live, &bc->freeable);
 +
 +      mutex_init(&c->verify_lock);
 +
++      shrink = shrinker_alloc(0, "%s/btree_cache", c->name);
++      if (!shrink)
 +              goto err;
++      bc->shrink = shrink;
++      shrink->count_objects   = bch2_btree_cache_count;
++      shrink->scan_objects    = bch2_btree_cache_scan;
++      shrink->seeks           = 4;
++      shrink->private_data    = c;
++      shrinker_register(shrink);
 +
 +      return 0;
 +err:
 +      return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 +}
 +
 +void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
 +{
 +      mutex_init(&bc->lock);
 +      INIT_LIST_HEAD(&bc->live);
 +      INIT_LIST_HEAD(&bc->freeable);
 +      INIT_LIST_HEAD(&bc->freed_pcpu);
 +      INIT_LIST_HEAD(&bc->freed_nonpcpu);
 +}
 +
 +/*
 + * We can only have one thread cannibalizing other cached btree nodes at a time,
 + * or we'll deadlock. We use an open coded mutex to ensure that, which a
 + * cannibalize_bucket() will take. This means every time we unlock the root of
 + * the btree, we need to release this lock if we have it held.
 + */
 +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +
 +      if (bc->alloc_lock == current) {
 +              trace_and_count(c, btree_cache_cannibalize_unlock, c);
 +              bc->alloc_lock = NULL;
 +              closure_wake_up(&bc->alloc_wait);
 +      }
 +}
 +
 +int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct task_struct *old;
 +
 +      old = cmpxchg(&bc->alloc_lock, NULL, current);
 +      if (old == NULL || old == current)
 +              goto success;
 +
 +      if (!cl) {
 +              trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
 +              return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
 +      }
 +
 +      closure_wait(&bc->alloc_wait, cl);
 +
 +      /* Try again, after adding ourselves to waitlist */
 +      old = cmpxchg(&bc->alloc_lock, NULL, current);
 +      if (old == NULL || old == current) {
 +              /* We raced */
 +              closure_wake_up(&bc->alloc_wait);
 +              goto success;
 +      }
 +
 +      trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
 +      return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
 +
 +success:
 +      trace_and_count(c, btree_cache_cannibalize_lock, c);
 +      return 0;
 +}
 +
 +static struct btree *btree_node_cannibalize(struct bch_fs *c)
 +{
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      list_for_each_entry_reverse(b, &bc->live, list)
 +              if (!btree_node_reclaim(c, b))
 +                      return b;
 +
 +      while (1) {
 +              list_for_each_entry_reverse(b, &bc->live, list)
 +                      if (!btree_node_write_and_reclaim(c, b))
 +                              return b;
 +
 +              /*
 +               * Rare case: all nodes were intent-locked.
 +               * Just busy-wait.
 +               */
 +              WARN_ONCE(1, "btree cache cannibalize failed\n");
 +              cond_resched();
 +      }
 +}
 +
 +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct list_head *freed = pcpu_read_locks
 +              ? &bc->freed_pcpu
 +              : &bc->freed_nonpcpu;
 +      struct btree *b, *b2;
 +      u64 start_time = local_clock();
 +      unsigned flags;
 +
 +      flags = memalloc_nofs_save();
 +      mutex_lock(&bc->lock);
 +
 +      /*
 +       * We never free struct btree itself, just the memory that holds the on
 +       * disk node. Check the freed list before allocating a new one:
 +       */
 +      list_for_each_entry(b, freed, list)
 +              if (!btree_node_reclaim(c, b)) {
 +                      list_del_init(&b->list);
 +                      goto got_node;
 +              }
 +
 +      b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
 +      if (!b) {
 +              mutex_unlock(&bc->lock);
 +              bch2_trans_unlock(trans);
 +              b = __btree_node_mem_alloc(c, GFP_KERNEL);
 +              if (!b)
 +                      goto err;
 +              mutex_lock(&bc->lock);
 +      }
 +
 +      bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
 +
 +      BUG_ON(!six_trylock_intent(&b->c.lock));
 +      BUG_ON(!six_trylock_write(&b->c.lock));
 +got_node:
 +
 +      /*
 +       * btree_free() doesn't free memory; it sticks the node on the end of
 +       * the list. Check if there's any freed nodes there:
 +       */
 +      list_for_each_entry(b2, &bc->freeable, list)
 +              if (!btree_node_reclaim(c, b2)) {
 +                      swap(b->data, b2->data);
 +                      swap(b->aux_data, b2->aux_data);
 +                      btree_node_to_freedlist(bc, b2);
 +                      six_unlock_write(&b2->c.lock);
 +                      six_unlock_intent(&b2->c.lock);
 +                      goto got_mem;
 +              }
 +
 +      mutex_unlock(&bc->lock);
 +
 +      if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
 +              bch2_trans_unlock(trans);
 +              if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
 +                      goto err;
 +      }
 +
 +      mutex_lock(&bc->lock);
 +      bc->used++;
 +got_mem:
 +      mutex_unlock(&bc->lock);
 +
 +      BUG_ON(btree_node_hashed(b));
 +      BUG_ON(btree_node_dirty(b));
 +      BUG_ON(btree_node_write_in_flight(b));
 +out:
 +      b->flags                = 0;
 +      b->written              = 0;
 +      b->nsets                = 0;
 +      b->sib_u64s[0]          = 0;
 +      b->sib_u64s[1]          = 0;
 +      b->whiteout_u64s        = 0;
 +      bch2_btree_keys_init(b);
 +      set_btree_node_accessed(b);
 +
 +      bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
 +                             start_time);
 +
 +      memalloc_nofs_restore(flags);
 +      return b;
 +err:
 +      mutex_lock(&bc->lock);
 +
 +      /* Try to cannibalize another cached btree node: */
 +      if (bc->alloc_lock == current) {
 +              b2 = btree_node_cannibalize(c);
 +              clear_btree_node_just_written(b2);
 +              bch2_btree_node_hash_remove(bc, b2);
 +
 +              if (b) {
 +                      swap(b->data, b2->data);
 +                      swap(b->aux_data, b2->aux_data);
 +                      btree_node_to_freedlist(bc, b2);
 +                      six_unlock_write(&b2->c.lock);
 +                      six_unlock_intent(&b2->c.lock);
 +              } else {
 +                      b = b2;
 +                      list_del_init(&b->list);
 +              }
 +
 +              mutex_unlock(&bc->lock);
 +
 +              trace_and_count(c, btree_cache_cannibalize, c);
 +              goto out;
 +      }
 +
 +      mutex_unlock(&bc->lock);
 +      memalloc_nofs_restore(flags);
 +      return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
 +}
 +
 +/* Slowpath, don't want it inlined into btree_iter_traverse() */
 +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 +                              struct btree_path *path,
 +                              const struct bkey_i *k,
 +                              enum btree_id btree_id,
 +                              unsigned level,
 +                              enum six_lock_type lock_type,
 +                              bool sync)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      u32 seq;
 +
 +      BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
 +      /*
 +       * Parent node must be locked, else we could read in a btree node that's
 +       * been freed:
 +       */
 +      if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
 +              trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
 +      }
 +
 +      b = bch2_btree_node_mem_alloc(trans, level != 0);
 +
 +      if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
 +              trans->memory_allocation_failure = true;
 +              trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
 +      }
 +
 +      if (IS_ERR(b))
 +              return b;
 +
 +      /*
 +       * Btree nodes read in from disk should not have the accessed bit set
 +       * initially, so that linear scans don't thrash the cache:
 +       */
 +      clear_btree_node_accessed(b);
 +
 +      bkey_copy(&b->key, k);
 +      if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
 +              /* raced with another fill: */
 +
 +              /* mark as unhashed... */
 +              b->hash_val = 0;
 +
 +              mutex_lock(&bc->lock);
 +              list_add(&b->list, &bc->freeable);
 +              mutex_unlock(&bc->lock);
 +
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              return NULL;
 +      }
 +
 +      set_btree_node_read_in_flight(b);
 +
 +      six_unlock_write(&b->c.lock);
 +      seq = six_lock_seq(&b->c.lock);
 +      six_unlock_intent(&b->c.lock);
 +
 +      /* Unlock before doing IO: */
 +      if (path && sync)
 +              bch2_trans_unlock_noassert(trans);
 +
 +      bch2_btree_node_read(c, b, sync);
 +
 +      if (!sync)
 +              return NULL;
 +
 +      if (path) {
 +              int ret = bch2_trans_relock(trans) ?:
 +                      bch2_btree_path_relock_intent(trans, path);
 +              if (ret) {
 +                      BUG_ON(!trans->restarted);
 +                      return ERR_PTR(ret);
 +              }
 +      }
 +
 +      if (!six_relock_type(&b->c.lock, lock_type, seq)) {
 +              if (path)
 +                      trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
 +      }
 +
 +      return b;
 +}
 +
 +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
 +{
 +      struct printbuf buf = PRINTBUF;
 +
 +      if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
 +              return;
 +
 +      prt_printf(&buf,
 +             "btree node header doesn't match ptr\n"
 +             "btree %s level %u\n"
 +             "ptr: ",
 +             bch2_btree_ids[b->c.btree_id], b->c.level);
 +      bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 +
 +      prt_printf(&buf, "\nheader: btree %s level %llu\n"
 +             "min ",
 +             bch2_btree_ids[BTREE_NODE_ID(b->data)],
 +             BTREE_NODE_LEVEL(b->data));
 +      bch2_bpos_to_text(&buf, b->data->min_key);
 +
 +      prt_printf(&buf, "\nmax ");
 +      bch2_bpos_to_text(&buf, b->data->max_key);
 +
 +      bch2_fs_inconsistent(c, "%s", buf.buf);
 +      printbuf_exit(&buf);
 +}
 +
 +static inline void btree_check_header(struct bch_fs *c, struct btree *b)
 +{
 +      if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
 +          b->c.level != BTREE_NODE_LEVEL(b->data) ||
 +          !bpos_eq(b->data->max_key, b->key.k.p) ||
 +          (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
 +           !bpos_eq(b->data->min_key,
 +                    bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
 +              btree_bad_header(c, b);
 +}
 +
 +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
 +                                         const struct bkey_i *k, unsigned level,
 +                                         enum six_lock_type lock_type,
 +                                         unsigned long trace_ip)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      struct bset_tree *t;
 +      bool need_relock = false;
 +      int ret;
 +
 +      EBUG_ON(level >= BTREE_MAX_DEPTH);
 +retry:
 +      b = btree_cache_find(bc, k);
 +      if (unlikely(!b)) {
 +              /*
 +               * We must have the parent locked to call bch2_btree_node_fill(),
 +               * else we could read in a btree node from disk that's been
 +               * freed:
 +               */
 +              b = bch2_btree_node_fill(trans, path, k, path->btree_id,
 +                                       level, lock_type, true);
 +              need_relock = true;
 +
 +              /* We raced and found the btree node in the cache */
 +              if (!b)
 +                      goto retry;
 +
 +              if (IS_ERR(b))
 +                      return b;
 +      } else {
 +              if (btree_node_read_locked(path, level + 1))
 +                      btree_node_unlock(trans, path, level + 1);
 +
 +              ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      return ERR_PTR(ret);
 +
 +              BUG_ON(ret);
 +
 +              if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
 +                           b->c.level != level ||
 +                           race_fault())) {
 +                      six_unlock_type(&b->c.lock, lock_type);
 +                      if (bch2_btree_node_relock(trans, path, level + 1))
 +                              goto retry;
 +
 +                      trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
 +                      return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
 +              }
 +
 +              /* avoid atomic set bit if it's not needed: */
 +              if (!btree_node_accessed(b))
 +                      set_btree_node_accessed(b);
 +      }
 +
 +      if (unlikely(btree_node_read_in_flight(b))) {
 +              u32 seq = six_lock_seq(&b->c.lock);
 +
 +              six_unlock_type(&b->c.lock, lock_type);
 +              bch2_trans_unlock(trans);
 +              need_relock = true;
 +
 +              bch2_btree_node_wait_on_read(b);
 +
 +              /*
 +               * should_be_locked is not set on this path yet, so we need to
 +               * relock it specifically:
 +               */
 +              if (!six_relock_type(&b->c.lock, lock_type, seq))
 +                      goto retry;
 +      }
 +
 +      if (unlikely(need_relock)) {
 +              ret = bch2_trans_relock(trans) ?:
 +                      bch2_btree_path_relock_intent(trans, path);
 +              if (ret) {
 +                      six_unlock_type(&b->c.lock, lock_type);
 +                      return ERR_PTR(ret);
 +              }
 +      }
 +
 +      prefetch(b->aux_data);
 +
 +      for_each_bset(b, t) {
 +              void *p = (u64 *) b->aux_data + t->aux_data_offset;
 +
 +              prefetch(p + L1_CACHE_BYTES * 0);
 +              prefetch(p + L1_CACHE_BYTES * 1);
 +              prefetch(p + L1_CACHE_BYTES * 2);
 +      }
 +
 +      if (unlikely(btree_node_read_error(b))) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              return ERR_PTR(-EIO);
 +      }
 +
 +      EBUG_ON(b->c.btree_id != path->btree_id);
 +      EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
 +      btree_check_header(c, b);
 +
 +      return b;
 +}
 +
 +/**
 + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 + * in from disk if necessary.
 + *
 + * @trans:    btree transaction object
 + * @path:     btree_path being traversed
 + * @k:                pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 + * @level:    level of btree node being looked up (0 == leaf node)
 + * @lock_type:        SIX_LOCK_read or SIX_LOCK_intent
 + * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 + *
 + * The btree node will have either a read or a write lock held, depending on
 + * the @write parameter.
 + *
 + * Returns: btree node or ERR_PTR()
 + */
 +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
 +                                const struct bkey_i *k, unsigned level,
 +                                enum six_lock_type lock_type,
 +                                unsigned long trace_ip)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree *b;
 +      struct bset_tree *t;
 +      int ret;
 +
 +      EBUG_ON(level >= BTREE_MAX_DEPTH);
 +
 +      b = btree_node_mem_ptr(k);
 +
 +      /*
 +       * Check b->hash_val _before_ calling btree_node_lock() - this might not
 +       * be the node we want anymore, and trying to lock the wrong node could
 +       * cause an unneccessary transaction restart:
 +       */
 +      if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
 +                   !b ||
 +                   b->hash_val != btree_ptr_hash_val(k)))
 +              return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 +
 +      if (btree_node_read_locked(path, level + 1))
 +              btree_node_unlock(trans, path, level + 1);
 +
 +      ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              return ERR_PTR(ret);
 +
 +      BUG_ON(ret);
 +
 +      if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
 +                   b->c.level != level ||
 +                   race_fault())) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              if (bch2_btree_node_relock(trans, path, level + 1))
 +                      return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 +
 +              trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
 +              return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
 +      }
 +
 +      if (unlikely(btree_node_read_in_flight(b))) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 +      }
 +
 +      prefetch(b->aux_data);
 +
 +      for_each_bset(b, t) {
 +              void *p = (u64 *) b->aux_data + t->aux_data_offset;
 +
 +              prefetch(p + L1_CACHE_BYTES * 0);
 +              prefetch(p + L1_CACHE_BYTES * 1);
 +              prefetch(p + L1_CACHE_BYTES * 2);
 +      }
 +
 +      /* avoid atomic set bit if it's not needed: */
 +      if (!btree_node_accessed(b))
 +              set_btree_node_accessed(b);
 +
 +      if (unlikely(btree_node_read_error(b))) {
 +              six_unlock_type(&b->c.lock, lock_type);
 +              return ERR_PTR(-EIO);
 +      }
 +
 +      EBUG_ON(b->c.btree_id != path->btree_id);
 +      EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
 +      btree_check_header(c, b);
 +
 +      return b;
 +}
 +
 +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
 +                                       const struct bkey_i *k,
 +                                       enum btree_id btree_id,
 +                                       unsigned level,
 +                                       bool nofill)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +      struct bset_tree *t;
 +      int ret;
 +
 +      EBUG_ON(level >= BTREE_MAX_DEPTH);
 +
 +      if (c->opts.btree_node_mem_ptr_optimization) {
 +              b = btree_node_mem_ptr(k);
 +              if (b)
 +                      goto lock_node;
 +      }
 +retry:
 +      b = btree_cache_find(bc, k);
 +      if (unlikely(!b)) {
 +              if (nofill)
 +                      goto out;
 +
 +              b = bch2_btree_node_fill(trans, NULL, k, btree_id,
 +                                       level, SIX_LOCK_read, true);
 +
 +              /* We raced and found the btree node in the cache */
 +              if (!b)
 +                      goto retry;
 +
 +              if (IS_ERR(b) &&
 +                  !bch2_btree_cache_cannibalize_lock(c, NULL))
 +                      goto retry;
 +
 +              if (IS_ERR(b))
 +                      goto out;
 +      } else {
 +lock_node:
 +              ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      return ERR_PTR(ret);
 +
 +              BUG_ON(ret);
 +
 +              if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
 +                           b->c.btree_id != btree_id ||
 +                           b->c.level != level)) {
 +                      six_unlock_read(&b->c.lock);
 +                      goto retry;
 +              }
 +      }
 +
 +      /* XXX: waiting on IO with btree locks held: */
 +      __bch2_btree_node_wait_on_read(b);
 +
 +      prefetch(b->aux_data);
 +
 +      for_each_bset(b, t) {
 +              void *p = (u64 *) b->aux_data + t->aux_data_offset;
 +
 +              prefetch(p + L1_CACHE_BYTES * 0);
 +              prefetch(p + L1_CACHE_BYTES * 1);
 +              prefetch(p + L1_CACHE_BYTES * 2);
 +      }
 +
 +      /* avoid atomic set bit if it's not needed: */
 +      if (!btree_node_accessed(b))
 +              set_btree_node_accessed(b);
 +
 +      if (unlikely(btree_node_read_error(b))) {
 +              six_unlock_read(&b->c.lock);
 +              b = ERR_PTR(-EIO);
 +              goto out;
 +      }
 +
 +      EBUG_ON(b->c.btree_id != btree_id);
 +      EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
 +      btree_check_header(c, b);
 +out:
 +      bch2_btree_cache_cannibalize_unlock(c);
 +      return b;
 +}
 +
 +int bch2_btree_node_prefetch(struct btree_trans *trans,
 +                           struct btree_path *path,
 +                           const struct bkey_i *k,
 +                           enum btree_id btree_id, unsigned level)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      BUG_ON(trans && !btree_node_locked(path, level + 1));
 +      BUG_ON(level >= BTREE_MAX_DEPTH);
 +
 +      b = btree_cache_find(bc, k);
 +      if (b)
 +              return 0;
 +
 +      b = bch2_btree_node_fill(trans, path, k, btree_id,
 +                               level, SIX_LOCK_read, false);
 +      return PTR_ERR_OR_ZERO(b);
 +}
 +
 +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_cache *bc = &c->btree_cache;
 +      struct btree *b;
 +
 +      b = btree_cache_find(bc, k);
 +      if (!b)
 +              return;
 +wait_on_io:
 +      /* not allowed to wait on io with btree locks held: */
 +
 +      /* XXX we're called from btree_gc which will be holding other btree
 +       * nodes locked
 +       */
 +      __bch2_btree_node_wait_on_read(b);
 +      __bch2_btree_node_wait_on_write(b);
 +
 +      btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
 +      btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
 +
 +      if (btree_node_dirty(b)) {
 +              __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
 +              six_unlock_write(&b->c.lock);
 +              six_unlock_intent(&b->c.lock);
 +              goto wait_on_io;
 +      }
 +
 +      BUG_ON(btree_node_dirty(b));
 +
 +      mutex_lock(&bc->lock);
 +      btree_node_data_free(c, b);
 +      bch2_btree_node_hash_remove(bc, b);
 +      mutex_unlock(&bc->lock);
 +
 +      six_unlock_write(&b->c.lock);
 +      six_unlock_intent(&b->c.lock);
 +}
 +
 +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 +                           const struct btree *b)
 +{
 +      struct bset_stats stats;
 +
 +      memset(&stats, 0, sizeof(stats));
 +
 +      bch2_btree_keys_stats(b, &stats);
 +
 +      prt_printf(out, "l %u ", b->c.level);
 +      bch2_bpos_to_text(out, b->data->min_key);
 +      prt_printf(out, " - ");
 +      bch2_bpos_to_text(out, b->data->max_key);
 +      prt_printf(out, ":\n"
 +             "    ptrs: ");
 +      bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
 +      prt_newline(out);
 +
 +      prt_printf(out,
 +             "    format: ");
 +      bch2_bkey_format_to_text(out, &b->format);
 +
 +      prt_printf(out,
 +             "    unpack fn len: %u\n"
 +             "    bytes used %zu/%zu (%zu%% full)\n"
 +             "    sib u64s: %u, %u (merge threshold %u)\n"
 +             "    nr packed keys %u\n"
 +             "    nr unpacked keys %u\n"
 +             "    floats %zu\n"
 +             "    failed unpacked %zu\n",
 +             b->unpack_fn_len,
 +             b->nr.live_u64s * sizeof(u64),
 +             btree_bytes(c) - sizeof(struct btree_node),
 +             b->nr.live_u64s * 100 / btree_max_u64s(c),
 +             b->sib_u64s[0],
 +             b->sib_u64s[1],
 +             c->btree_foreground_merge_threshold,
 +             b->nr.packed_keys,
 +             b->nr.unpacked_keys,
 +             stats.floats,
 +             stats.failed);
 +}
 +
 +void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
 +{
 +      prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
 +      prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
 +      prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
 +}
index 29a0b566a4fe9ebdfd07666249c0eed23d427bea,0000000000000000000000000000000000000000..f9a5e38a085bbfb280fbe439ca2a6b1f0ba2f1af
mode 100644,000000..100644
--- /dev/null
@@@ -1,1072 -1,0 +1,1075 @@@
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_key_cache.shrink);
 +// SPDX-License-Identifier: GPL-2.0
 +
 +#include "bcachefs.h"
 +#include "btree_cache.h"
 +#include "btree_iter.h"
 +#include "btree_key_cache.h"
 +#include "btree_locking.h"
 +#include "btree_update.h"
 +#include "errcode.h"
 +#include "error.h"
 +#include "journal.h"
 +#include "journal_reclaim.h"
 +#include "trace.h"
 +
 +#include <linux/sched/mm.h>
 +
 +static inline bool btree_uses_pcpu_readers(enum btree_id id)
 +{
 +      return id == BTREE_ID_subvolumes;
 +}
 +
 +static struct kmem_cache *bch2_key_cache;
 +
 +static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
 +                                     const void *obj)
 +{
 +      const struct bkey_cached *ck = obj;
 +      const struct bkey_cached_key *key = arg->key;
 +
 +      return ck->key.btree_id != key->btree_id ||
 +              !bpos_eq(ck->key.pos, key->pos);
 +}
 +
 +static const struct rhashtable_params bch2_btree_key_cache_params = {
 +      .head_offset    = offsetof(struct bkey_cached, hash),
 +      .key_offset     = offsetof(struct bkey_cached, key),
 +      .key_len        = sizeof(struct bkey_cached_key),
 +      .obj_cmpfn      = bch2_btree_key_cache_cmp_fn,
 +};
 +
 +__flatten
 +inline struct bkey_cached *
 +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
 +{
 +      struct bkey_cached_key key = {
 +              .btree_id       = btree_id,
 +              .pos            = pos,
 +      };
 +
 +      return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
 +                                    bch2_btree_key_cache_params);
 +}
 +
 +static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
 +{
 +      if (!six_trylock_intent(&ck->c.lock))
 +              return false;
 +
 +      if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              six_unlock_intent(&ck->c.lock);
 +              return false;
 +      }
 +
 +      if (!six_trylock_write(&ck->c.lock)) {
 +              six_unlock_intent(&ck->c.lock);
 +              return false;
 +      }
 +
 +      return true;
 +}
 +
 +static void bkey_cached_evict(struct btree_key_cache *c,
 +                            struct bkey_cached *ck)
 +{
 +      BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
 +                                    bch2_btree_key_cache_params));
 +      memset(&ck->key, ~0, sizeof(ck->key));
 +
 +      atomic_long_dec(&c->nr_keys);
 +}
 +
 +static void bkey_cached_free(struct btree_key_cache *bc,
 +                           struct bkey_cached *ck)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 +
 +      BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 +
 +      ck->btree_trans_barrier_seq =
 +              start_poll_synchronize_srcu(&c->btree_trans_barrier);
 +
 +      if (ck->c.lock.readers)
 +              list_move_tail(&ck->list, &bc->freed_pcpu);
 +      else
 +              list_move_tail(&ck->list, &bc->freed_nonpcpu);
 +      atomic_long_inc(&bc->nr_freed);
 +
 +      kfree(ck->k);
 +      ck->k           = NULL;
 +      ck->u64s        = 0;
 +
 +      six_unlock_write(&ck->c.lock);
 +      six_unlock_intent(&ck->c.lock);
 +}
 +
 +#ifdef __KERNEL__
 +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 +                                                 struct bkey_cached *ck)
 +{
 +      struct bkey_cached *pos;
 +
 +      list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
 +              if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
 +                               pos->btree_trans_barrier_seq)) {
 +                      list_move(&ck->list, &pos->list);
 +                      return;
 +              }
 +      }
 +
 +      list_move(&ck->list, &bc->freed_nonpcpu);
 +}
 +#endif
 +
 +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 +                                       struct bkey_cached *ck)
 +{
 +      BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 +
 +      if (!ck->c.lock.readers) {
 +#ifdef __KERNEL__
 +              struct btree_key_cache_freelist *f;
 +              bool freed = false;
 +
 +              preempt_disable();
 +              f = this_cpu_ptr(bc->pcpu_freed);
 +
 +              if (f->nr < ARRAY_SIZE(f->objs)) {
 +                      f->objs[f->nr++] = ck;
 +                      freed = true;
 +              }
 +              preempt_enable();
 +
 +              if (!freed) {
 +                      mutex_lock(&bc->lock);
 +                      preempt_disable();
 +                      f = this_cpu_ptr(bc->pcpu_freed);
 +
 +                      while (f->nr > ARRAY_SIZE(f->objs) / 2) {
 +                              struct bkey_cached *ck2 = f->objs[--f->nr];
 +
 +                              __bkey_cached_move_to_freelist_ordered(bc, ck2);
 +                      }
 +                      preempt_enable();
 +
 +                      __bkey_cached_move_to_freelist_ordered(bc, ck);
 +                      mutex_unlock(&bc->lock);
 +              }
 +#else
 +              mutex_lock(&bc->lock);
 +              list_move_tail(&ck->list, &bc->freed_nonpcpu);
 +              mutex_unlock(&bc->lock);
 +#endif
 +      } else {
 +              mutex_lock(&bc->lock);
 +              list_move_tail(&ck->list, &bc->freed_pcpu);
 +              mutex_unlock(&bc->lock);
 +      }
 +}
 +
 +static void bkey_cached_free_fast(struct btree_key_cache *bc,
 +                                struct bkey_cached *ck)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 +
 +      ck->btree_trans_barrier_seq =
 +              start_poll_synchronize_srcu(&c->btree_trans_barrier);
 +
 +      list_del_init(&ck->list);
 +      atomic_long_inc(&bc->nr_freed);
 +
 +      kfree(ck->k);
 +      ck->k           = NULL;
 +      ck->u64s        = 0;
 +
 +      bkey_cached_move_to_freelist(bc, ck);
 +
 +      six_unlock_write(&ck->c.lock);
 +      six_unlock_intent(&ck->c.lock);
 +}
 +
 +static struct bkey_cached *
 +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 +                bool *was_new)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      struct bkey_cached *ck = NULL;
 +      bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
 +      int ret;
 +
 +      if (!pcpu_readers) {
 +#ifdef __KERNEL__
 +              struct btree_key_cache_freelist *f;
 +
 +              preempt_disable();
 +              f = this_cpu_ptr(bc->pcpu_freed);
 +              if (f->nr)
 +                      ck = f->objs[--f->nr];
 +              preempt_enable();
 +
 +              if (!ck) {
 +                      mutex_lock(&bc->lock);
 +                      preempt_disable();
 +                      f = this_cpu_ptr(bc->pcpu_freed);
 +
 +                      while (!list_empty(&bc->freed_nonpcpu) &&
 +                             f->nr < ARRAY_SIZE(f->objs) / 2) {
 +                              ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 +                              list_del_init(&ck->list);
 +                              f->objs[f->nr++] = ck;
 +                      }
 +
 +                      ck = f->nr ? f->objs[--f->nr] : NULL;
 +                      preempt_enable();
 +                      mutex_unlock(&bc->lock);
 +              }
 +#else
 +              mutex_lock(&bc->lock);
 +              if (!list_empty(&bc->freed_nonpcpu)) {
 +                      ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 +                      list_del_init(&ck->list);
 +              }
 +              mutex_unlock(&bc->lock);
 +#endif
 +      } else {
 +              mutex_lock(&bc->lock);
 +              if (!list_empty(&bc->freed_pcpu)) {
 +                      ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
 +                      list_del_init(&ck->list);
 +              }
 +              mutex_unlock(&bc->lock);
 +      }
 +
 +      if (ck) {
 +              ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
 +              if (unlikely(ret)) {
 +                      bkey_cached_move_to_freelist(bc, ck);
 +                      return ERR_PTR(ret);
 +              }
 +
 +              path->l[0].b = (void *) ck;
 +              path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 +
 +              ret = bch2_btree_node_lock_write(trans, path, &ck->c);
 +              if (unlikely(ret)) {
 +                      btree_node_unlock(trans, path, 0);
 +                      bkey_cached_move_to_freelist(bc, ck);
 +                      return ERR_PTR(ret);
 +              }
 +
 +              return ck;
 +      }
 +
 +      ck = allocate_dropping_locks(trans, ret,
 +                      kmem_cache_zalloc(bch2_key_cache, _gfp));
 +      if (ret) {
 +              kmem_cache_free(bch2_key_cache, ck);
 +              return ERR_PTR(ret);
 +      }
 +
 +      if (!ck)
 +              return NULL;
 +
 +      INIT_LIST_HEAD(&ck->list);
 +      bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
 +
 +      ck->c.cached = true;
 +      BUG_ON(!six_trylock_intent(&ck->c.lock));
 +      BUG_ON(!six_trylock_write(&ck->c.lock));
 +      *was_new = true;
 +      return ck;
 +}
 +
 +static struct bkey_cached *
 +bkey_cached_reuse(struct btree_key_cache *c)
 +{
 +      struct bucket_table *tbl;
 +      struct rhash_head *pos;
 +      struct bkey_cached *ck;
 +      unsigned i;
 +
 +      mutex_lock(&c->lock);
 +      rcu_read_lock();
 +      tbl = rht_dereference_rcu(c->table.tbl, &c->table);
 +      for (i = 0; i < tbl->size; i++)
 +              rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 +                      if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 +                          bkey_cached_lock_for_evict(ck)) {
 +                              bkey_cached_evict(c, ck);
 +                              goto out;
 +                      }
 +              }
 +      ck = NULL;
 +out:
 +      rcu_read_unlock();
 +      mutex_unlock(&c->lock);
 +      return ck;
 +}
 +
 +static struct bkey_cached *
 +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      struct bkey_cached *ck;
 +      bool was_new = false;
 +
 +      ck = bkey_cached_alloc(trans, path, &was_new);
 +      if (IS_ERR(ck))
 +              return ck;
 +
 +      if (unlikely(!ck)) {
 +              ck = bkey_cached_reuse(bc);
 +              if (unlikely(!ck)) {
 +                      bch_err(c, "error allocating memory for key cache item, btree %s",
 +                              bch2_btree_ids[path->btree_id]);
 +                      return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 +      }
 +
 +      ck->c.level             = 0;
 +      ck->c.btree_id          = path->btree_id;
 +      ck->key.btree_id        = path->btree_id;
 +      ck->key.pos             = path->pos;
 +      ck->valid               = false;
 +      ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 +
 +      if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
 +                                        &ck->hash,
 +                                        bch2_btree_key_cache_params))) {
 +              /* We raced with another fill: */
 +
 +              if (likely(was_new)) {
 +                      six_unlock_write(&ck->c.lock);
 +                      six_unlock_intent(&ck->c.lock);
 +                      kfree(ck);
 +              } else {
 +                      bkey_cached_free_fast(bc, ck);
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
 +              return NULL;
 +      }
 +
 +      atomic_long_inc(&bc->nr_keys);
 +
 +      six_unlock_write(&ck->c.lock);
 +
 +      return ck;
 +}
 +
 +static int btree_key_cache_fill(struct btree_trans *trans,
 +                              struct btree_path *ck_path,
 +                              struct bkey_cached *ck)
 +{
 +      struct btree_iter iter;
 +      struct bkey_s_c k;
 +      unsigned new_u64s = 0;
 +      struct bkey_i *new_k = NULL;
 +      int ret;
 +
 +      k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
 +                             BTREE_ITER_KEY_CACHE_FILL|
 +                             BTREE_ITER_CACHED_NOFILL);
 +      ret = bkey_err(k);
 +      if (ret)
 +              goto err;
 +
 +      if (!bch2_btree_node_relock(trans, ck_path, 0)) {
 +              trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
 +              ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
 +              goto err;
 +      }
 +
 +      /*
 +       * bch2_varint_decode can read past the end of the buffer by at
 +       * most 7 bytes (it won't be used):
 +       */
 +      new_u64s = k.k->u64s + 1;
 +
 +      /*
 +       * Allocate some extra space so that the transaction commit path is less
 +       * likely to have to reallocate, since that requires a transaction
 +       * restart:
 +       */
 +      new_u64s = min(256U, (new_u64s * 3) / 2);
 +
 +      if (new_u64s > ck->u64s) {
 +              new_u64s = roundup_pow_of_two(new_u64s);
 +              new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
 +              if (!new_k) {
 +                      bch2_trans_unlock(trans);
 +
 +                      new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
 +                      if (!new_k) {
 +                              bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
 +                                      bch2_btree_ids[ck->key.btree_id], new_u64s);
 +                              ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
 +                              goto err;
 +                      }
 +
 +                      if (!bch2_btree_node_relock(trans, ck_path, 0)) {
 +                              kfree(new_k);
 +                              trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
 +                              ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
 +                              goto err;
 +                      }
 +
 +                      ret = bch2_trans_relock(trans);
 +                      if (ret) {
 +                              kfree(new_k);
 +                              goto err;
 +                      }
 +              }
 +      }
 +
 +      ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
 +      if (ret) {
 +              kfree(new_k);
 +              goto err;
 +      }
 +
 +      if (new_k) {
 +              kfree(ck->k);
 +              ck->u64s = new_u64s;
 +              ck->k = new_k;
 +      }
 +
 +      bkey_reassemble(ck->k, k);
 +      ck->valid = true;
 +      bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 +
 +      /* We're not likely to need this iterator again: */
 +      set_btree_iter_dontneed(&iter);
 +err:
 +      bch2_trans_iter_exit(trans, &iter);
 +      return ret;
 +}
 +
 +static noinline int
 +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
 +                                       unsigned flags)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck;
 +      int ret = 0;
 +
 +      BUG_ON(path->level);
 +
 +      path->l[1].b = NULL;
 +
 +      if (bch2_btree_node_relock_notrace(trans, path, 0)) {
 +              ck = (void *) path->l[0].b;
 +              goto fill;
 +      }
 +retry:
 +      ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
 +      if (!ck) {
 +              ck = btree_key_cache_create(trans, path);
 +              ret = PTR_ERR_OR_ZERO(ck);
 +              if (ret)
 +                      goto err;
 +              if (!ck)
 +                      goto retry;
 +
 +              mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 +              path->locks_want = 1;
 +      } else {
 +              enum six_lock_type lock_want = __btree_lock_want(path, 0);
 +
 +              ret = btree_node_lock(trans, path, (void *) ck, 0,
 +                                    lock_want, _THIS_IP_);
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      goto err;
 +
 +              BUG_ON(ret);
 +
 +              if (ck->key.btree_id != path->btree_id ||
 +                  !bpos_eq(ck->key.pos, path->pos)) {
 +                      six_unlock_type(&ck->c.lock, lock_want);
 +                      goto retry;
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0,
 +                                     (enum btree_node_locked_type) lock_want);
 +      }
 +
 +      path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
 +      path->l[0].b            = (void *) ck;
 +fill:
 +      path->uptodate = BTREE_ITER_UPTODATE;
 +
 +      if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
 +              /*
 +               * Using the underscore version because we haven't set
 +               * path->uptodate yet:
 +               */
 +              if (!path->locks_want &&
 +                  !__bch2_btree_path_upgrade(trans, path, 1)) {
 +                      trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
 +                      ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
 +                      goto err;
 +              }
 +
 +              ret = btree_key_cache_fill(trans, path, ck);
 +              if (ret)
 +                      goto err;
 +
 +              ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
 +              if (ret)
 +                      goto err;
 +
 +              path->uptodate = BTREE_ITER_UPTODATE;
 +      }
 +
 +      if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 +              set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 +
 +      BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 +      BUG_ON(path->uptodate);
 +
 +      return ret;
 +err:
 +      path->uptodate = BTREE_ITER_NEED_TRAVERSE;
 +      if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
 +              btree_node_unlock(trans, path, 0);
 +              path->l[0].b = ERR_PTR(ret);
 +      }
 +      return ret;
 +}
 +
 +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
 +                                  unsigned flags)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck;
 +      int ret = 0;
 +
 +      EBUG_ON(path->level);
 +
 +      path->l[1].b = NULL;
 +
 +      if (bch2_btree_node_relock_notrace(trans, path, 0)) {
 +              ck = (void *) path->l[0].b;
 +              goto fill;
 +      }
 +retry:
 +      ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
 +      if (!ck) {
 +              return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
 +      } else {
 +              enum six_lock_type lock_want = __btree_lock_want(path, 0);
 +
 +              ret = btree_node_lock(trans, path, (void *) ck, 0,
 +                                    lock_want, _THIS_IP_);
 +              EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
 +
 +              if (ret)
 +                      return ret;
 +
 +              if (ck->key.btree_id != path->btree_id ||
 +                  !bpos_eq(ck->key.pos, path->pos)) {
 +                      six_unlock_type(&ck->c.lock, lock_want);
 +                      goto retry;
 +              }
 +
 +              mark_btree_node_locked(trans, path, 0,
 +                                     (enum btree_node_locked_type) lock_want);
 +      }
 +
 +      path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
 +      path->l[0].b            = (void *) ck;
 +fill:
 +      if (!ck->valid)
 +              return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
 +
 +      if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 +              set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 +
 +      path->uptodate = BTREE_ITER_UPTODATE;
 +      EBUG_ON(!ck->valid);
 +      EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 +
 +      return ret;
 +}
 +
 +static int btree_key_cache_flush_pos(struct btree_trans *trans,
 +                                   struct bkey_cached_key key,
 +                                   u64 journal_seq,
 +                                   unsigned commit_flags,
 +                                   bool evict)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct journal *j = &c->journal;
 +      struct btree_iter c_iter, b_iter;
 +      struct bkey_cached *ck = NULL;
 +      int ret;
 +
 +      bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
 +                           BTREE_ITER_SLOTS|
 +                           BTREE_ITER_INTENT|
 +                           BTREE_ITER_ALL_SNAPSHOTS);
 +      bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
 +                           BTREE_ITER_CACHED|
 +                           BTREE_ITER_INTENT);
 +      b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
 +
 +      ret = bch2_btree_iter_traverse(&c_iter);
 +      if (ret)
 +              goto out;
 +
 +      ck = (void *) c_iter.path->l[0].b;
 +      if (!ck)
 +              goto out;
 +
 +      if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              if (evict)
 +                      goto evict;
 +              goto out;
 +      }
 +
 +      BUG_ON(!ck->valid);
 +
 +      if (journal_seq && ck->journal.seq != journal_seq)
 +              goto out;
 +
 +      /*
 +       * Since journal reclaim depends on us making progress here, and the
 +       * allocator/copygc depend on journal reclaim making progress, we need
 +       * to be using alloc reserves:
 +       */
 +      ret   = bch2_btree_iter_traverse(&b_iter) ?:
 +              bch2_trans_update(trans, &b_iter, ck->k,
 +                                BTREE_UPDATE_KEY_CACHE_RECLAIM|
 +                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
 +                                BTREE_TRIGGER_NORUN) ?:
 +              bch2_trans_commit(trans, NULL, NULL,
 +                                BTREE_INSERT_NOCHECK_RW|
 +                                BTREE_INSERT_NOFAIL|
 +                                (ck->journal.seq == journal_last_seq(j)
 +                                 ? BCH_WATERMARK_reclaim
 +                                 : 0)|
 +                                commit_flags);
 +
 +      bch2_fs_fatal_err_on(ret &&
 +                           !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 +                           !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
 +                           !bch2_journal_error(j), c,
 +                           "error flushing key cache: %s", bch2_err_str(ret));
 +      if (ret)
 +              goto out;
 +
 +      bch2_journal_pin_drop(j, &ck->journal);
 +      bch2_journal_preres_put(j, &ck->res);
 +
 +      BUG_ON(!btree_node_locked(c_iter.path, 0));
 +
 +      if (!evict) {
 +              if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +                      clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +                      atomic_long_dec(&c->btree_key_cache.nr_dirty);
 +              }
 +      } else {
 +              struct btree_path *path2;
 +evict:
 +              trans_for_each_path(trans, path2)
 +                      if (path2 != c_iter.path)
 +                              __bch2_btree_path_unlock(trans, path2);
 +
 +              bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
 +
 +              if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +                      clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +                      atomic_long_dec(&c->btree_key_cache.nr_dirty);
 +              }
 +
 +              mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
 +              bkey_cached_evict(&c->btree_key_cache, ck);
 +              bkey_cached_free_fast(&c->btree_key_cache, ck);
 +      }
 +out:
 +      bch2_trans_iter_exit(trans, &b_iter);
 +      bch2_trans_iter_exit(trans, &c_iter);
 +      return ret;
 +}
 +
 +int bch2_btree_key_cache_journal_flush(struct journal *j,
 +                              struct journal_entry_pin *pin, u64 seq)
 +{
 +      struct bch_fs *c = container_of(j, struct bch_fs, journal);
 +      struct bkey_cached *ck =
 +              container_of(pin, struct bkey_cached, journal);
 +      struct bkey_cached_key key;
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 +      int ret = 0;
 +
 +      btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
 +      key = ck->key;
 +
 +      if (ck->journal.seq != seq ||
 +          !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              six_unlock_read(&ck->c.lock);
 +              goto unlock;
 +      }
 +
 +      if (ck->seq != seq) {
 +              bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
 +                                      bch2_btree_key_cache_journal_flush);
 +              six_unlock_read(&ck->c.lock);
 +              goto unlock;
 +      }
 +      six_unlock_read(&ck->c.lock);
 +
 +      ret = commit_do(trans, NULL, NULL, 0,
 +              btree_key_cache_flush_pos(trans, key, seq,
 +                              BTREE_INSERT_JOURNAL_RECLAIM, false));
 +unlock:
 +      srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 +
 +      bch2_trans_put(trans);
 +      return ret;
 +}
 +
 +/*
 + * Flush and evict a key from the key cache:
 + */
 +int bch2_btree_key_cache_flush(struct btree_trans *trans,
 +                             enum btree_id id, struct bpos pos)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached_key key = { id, pos };
 +
 +      /* Fastpath - assume it won't be found: */
 +      if (!bch2_btree_key_cache_find(c, id, pos))
 +              return 0;
 +
 +      return btree_key_cache_flush_pos(trans, key, 0, 0, true);
 +}
 +
 +bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 +                                unsigned flags,
 +                                struct btree_insert_entry *insert_entry)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
 +      struct bkey_i *insert = insert_entry->k;
 +      bool kick_reclaim = false;
 +
 +      BUG_ON(insert->k.u64s > ck->u64s);
 +
 +      if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
 +              int difference;
 +
 +              BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
 +
 +              difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
 +              if (difference > 0) {
 +                      trans->journal_preres.u64s      -= difference;
 +                      ck->res.u64s                    += difference;
 +              }
 +      }
 +
 +      bkey_copy(ck->k, insert);
 +      ck->valid = true;
 +
 +      if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 +              set_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +              atomic_long_inc(&c->btree_key_cache.nr_dirty);
 +
 +              if (bch2_nr_btree_keys_need_flush(c))
 +                      kick_reclaim = true;
 +      }
 +
 +      /*
 +       * To minimize lock contention, we only add the journal pin here and
 +       * defer pin updates to the flush callback via ->seq. Be careful not to
 +       * update ->seq on nojournal commits because we don't want to update the
 +       * pin to a seq that doesn't include journal updates on disk. Otherwise
 +       * we risk losing the update after a crash.
 +       *
 +       * The only exception is if the pin is not active in the first place. We
 +       * have to add the pin because journal reclaim drives key cache
 +       * flushing. The flush callback will not proceed unless ->seq matches
 +       * the latest pin, so make sure it starts with a consistent value.
 +       */
 +      if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
 +          !journal_pin_active(&ck->journal)) {
 +              ck->seq = trans->journal_res.seq;
 +      }
 +      bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
 +                           &ck->journal, bch2_btree_key_cache_journal_flush);
 +
 +      if (kick_reclaim)
 +              journal_reclaim_kick(&c->journal);
 +      return true;
 +}
 +
 +void bch2_btree_key_cache_drop(struct btree_trans *trans,
 +                             struct btree_path *path)
 +{
 +      struct bch_fs *c = trans->c;
 +      struct bkey_cached *ck = (void *) path->l[0].b;
 +
 +      BUG_ON(!ck->valid);
 +
 +      /*
 +       * We just did an update to the btree, bypassing the key cache: the key
 +       * cache key is now stale and must be dropped, even if dirty:
 +       */
 +      if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 +              clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 +              atomic_long_dec(&c->btree_key_cache.nr_dirty);
 +              bch2_journal_pin_drop(&c->journal, &ck->journal);
 +      }
 +
 +      ck->valid = false;
 +}
 +
 +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 +                                         struct shrink_control *sc)
 +{
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_key_cache.shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      struct bucket_table *tbl;
 +      struct bkey_cached *ck, *t;
 +      size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
 +      unsigned start, flags;
 +      int srcu_idx;
 +
 +      mutex_lock(&bc->lock);
 +      srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 +      flags = memalloc_nofs_save();
 +
 +      /*
 +       * Newest freed entries are at the end of the list - once we hit one
 +       * that's too new to be freed, we can bail out:
 +       */
 +      list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
 +              if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 +                                               ck->btree_trans_barrier_seq))
 +                      break;
 +
 +              list_del(&ck->list);
 +              six_lock_exit(&ck->c.lock);
 +              kmem_cache_free(bch2_key_cache, ck);
 +              atomic_long_dec(&bc->nr_freed);
 +              scanned++;
 +              freed++;
 +      }
 +
 +      if (scanned >= nr)
 +              goto out;
 +
 +      list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
 +              if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 +                                               ck->btree_trans_barrier_seq))
 +                      break;
 +
 +              list_del(&ck->list);
 +              six_lock_exit(&ck->c.lock);
 +              kmem_cache_free(bch2_key_cache, ck);
 +              atomic_long_dec(&bc->nr_freed);
 +              scanned++;
 +              freed++;
 +      }
 +
 +      if (scanned >= nr)
 +              goto out;
 +
 +      rcu_read_lock();
 +      tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 +      if (bc->shrink_iter >= tbl->size)
 +              bc->shrink_iter = 0;
 +      start = bc->shrink_iter;
 +
 +      do {
 +              struct rhash_head *pos, *next;
 +
 +              pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
 +
 +              while (!rht_is_a_nulls(pos)) {
 +                      next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
 +                      ck = container_of(pos, struct bkey_cached, hash);
 +
 +                      if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
 +                              goto next;
 +
 +                      if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 +                              clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 +                      else if (bkey_cached_lock_for_evict(ck)) {
 +                              bkey_cached_evict(bc, ck);
 +                              bkey_cached_free(bc, ck);
 +                      }
 +
 +                      scanned++;
 +                      if (scanned >= nr)
 +                              break;
 +next:
 +                      pos = next;
 +              }
 +
 +              bc->shrink_iter++;
 +              if (bc->shrink_iter >= tbl->size)
 +                      bc->shrink_iter = 0;
 +      } while (scanned < nr && bc->shrink_iter != start);
 +
 +      rcu_read_unlock();
 +out:
 +      memalloc_nofs_restore(flags);
 +      srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 +      mutex_unlock(&bc->lock);
 +
 +      return freed;
 +}
 +
 +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
 +                                          struct shrink_control *sc)
 +{
-       unregister_shrinker(&bc->shrink);
++      struct bch_fs *c = shrink->private_data;
 +      struct btree_key_cache *bc = &c->btree_key_cache;
 +      long nr = atomic_long_read(&bc->nr_keys) -
 +              atomic_long_read(&bc->nr_dirty);
 +
 +      return max(0L, nr);
 +}
 +
 +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 +      struct bucket_table *tbl;
 +      struct bkey_cached *ck, *n;
 +      struct rhash_head *pos;
 +      LIST_HEAD(items);
 +      unsigned i;
 +#ifdef __KERNEL__
 +      int cpu;
 +#endif
 +
-       bc->shrink.seeks                = 0;
-       bc->shrink.count_objects        = bch2_btree_key_cache_count;
-       bc->shrink.scan_objects         = bch2_btree_key_cache_scan;
-       if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
++      shrinker_free(bc->shrink);
 +
 +      mutex_lock(&bc->lock);
 +
 +      /*
 +       * The loop is needed to guard against racing with rehash:
 +       */
 +      while (atomic_long_read(&bc->nr_keys)) {
 +              rcu_read_lock();
 +              tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 +              if (tbl)
 +                      for (i = 0; i < tbl->size; i++)
 +                              rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 +                                      bkey_cached_evict(bc, ck);
 +                                      list_add(&ck->list, &items);
 +                              }
 +              rcu_read_unlock();
 +      }
 +
 +#ifdef __KERNEL__
 +      for_each_possible_cpu(cpu) {
 +              struct btree_key_cache_freelist *f =
 +                      per_cpu_ptr(bc->pcpu_freed, cpu);
 +
 +              for (i = 0; i < f->nr; i++) {
 +                      ck = f->objs[i];
 +                      list_add(&ck->list, &items);
 +              }
 +      }
 +#endif
 +
 +      list_splice(&bc->freed_pcpu,    &items);
 +      list_splice(&bc->freed_nonpcpu, &items);
 +
 +      mutex_unlock(&bc->lock);
 +
 +      list_for_each_entry_safe(ck, n, &items, list) {
 +              cond_resched();
 +
 +              bch2_journal_pin_drop(&c->journal, &ck->journal);
 +              bch2_journal_preres_put(&c->journal, &ck->res);
 +
 +              list_del(&ck->list);
 +              kfree(ck->k);
 +              six_lock_exit(&ck->c.lock);
 +              kmem_cache_free(bch2_key_cache, ck);
 +      }
 +
 +      if (atomic_long_read(&bc->nr_dirty) &&
 +          !bch2_journal_error(&c->journal) &&
 +          test_bit(BCH_FS_WAS_RW, &c->flags))
 +              panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
 +                    atomic_long_read(&bc->nr_dirty));
 +
 +      if (atomic_long_read(&bc->nr_keys))
 +              panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
 +                    atomic_long_read(&bc->nr_keys));
 +
 +      if (bc->table_init_done)
 +              rhashtable_destroy(&bc->table);
 +
 +      free_percpu(bc->pcpu_freed);
 +}
 +
 +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 +{
 +      mutex_init(&c->lock);
 +      INIT_LIST_HEAD(&c->freed_pcpu);
 +      INIT_LIST_HEAD(&c->freed_nonpcpu);
 +}
 +
 +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 +{
 +      struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
++      struct shrinker *shrink;
 +
 +#ifdef __KERNEL__
 +      bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
 +      if (!bc->pcpu_freed)
 +              return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 +#endif
 +
 +      if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
 +              return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 +
 +      bc->table_init_done = true;
 +
++      shrink = shrinker_alloc(0, "%s/btree_key_cache", c->name);
++      if (!shrink)
 +              return -BCH_ERR_ENOMEM_fs_btree_cache_init;
++      bc->shrink = shrink;
++      shrink->seeks           = 0;
++      shrink->count_objects   = bch2_btree_key_cache_count;
++      shrink->scan_objects    = bch2_btree_key_cache_scan;
++      shrink->private_data    = c;
++      shrinker_register(shrink);
 +      return 0;
 +}
 +
 +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 +{
 +      prt_printf(out, "nr_freed:\t%lu",       atomic_long_read(&c->nr_freed));
 +      prt_newline(out);
 +      prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
 +      prt_newline(out);
 +      prt_printf(out, "nr_dirty:\t%lu",       atomic_long_read(&c->nr_dirty));
 +      prt_newline(out);
 +}
 +
 +void bch2_btree_key_cache_exit(void)
 +{
 +      kmem_cache_destroy(bch2_key_cache);
 +}
 +
 +int __init bch2_btree_key_cache_init(void)
 +{
 +      bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
 +      if (!bch2_key_cache)
 +              return -ENOMEM;
 +
 +      return 0;
 +}
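
The "++" lines in the file above resolve the merge by converting the btree key cache from an embedded "struct shrinker" set up with register_shrinker()/unregister_shrinker() to the dynamically allocated shrinker API: shrinker_alloc(), shrinker_register() and shrinker_free(), with the owning bch_fs recovered through shrink->private_data instead of container_of(). A minimal, self-contained sketch of that registration pattern follows; the my_cache structure and my_cache_* callbacks are hypothetical names used only to illustrate the API and are not bcachefs code.

/* Illustrative sketch of the dynamically allocated shrinker pattern. */
#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/shrinker.h>

struct my_cache {
	struct shrinker	*shrink;
	atomic_long_t	nr_objects;
};

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* Owning object comes from ->private_data, not container_of(): */
	struct my_cache *cache = shrink->private_data;

	return atomic_long_read(&cache->nr_objects);
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/*
	 * A real implementation would free up to sc->nr_to_scan objects
	 * and return how many were freed; this stub frees nothing.
	 */
	return 0;
}

static int my_cache_init(struct my_cache *cache, const char *name)
{
	struct shrinker *shrink = shrinker_alloc(0, "%s/my_cache", name);

	if (!shrink)
		return -ENOMEM;

	cache->shrink		= shrink;
	shrink->count_objects	= my_cache_count;
	shrink->scan_objects	= my_cache_scan;
	shrink->private_data	= cache;
	shrinker_register(shrink);
	return 0;
}

static void my_cache_exit(struct my_cache *cache)
{
	/* Unregisters (if registered) and frees the shrinker. */
	shrinker_free(cache->shrink);
}

The same conversion shows up in the next file's resolution, where the btree node cache's embedded "struct shrinker shrink" member becomes a "struct shrinker *shrink" pointer.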
index c9a38e254949ec2b4aa400d253a52e099c1b0292,0000000000000000000000000000000000000000..bc6714d88925f3183ed9d817f8a79f106e55da7c
mode 100644,000000..100644
--- /dev/null
@@@ -1,739 -1,0 +1,739 @@@
-       struct shrinker         shrink;
 +/* SPDX-License-Identifier: GPL-2.0 */
 +#ifndef _BCACHEFS_BTREE_TYPES_H
 +#define _BCACHEFS_BTREE_TYPES_H
 +
 +#include <linux/list.h>
 +#include <linux/rhashtable.h>
 +
 +//#include "bkey_methods.h"
 +#include "buckets_types.h"
 +#include "darray.h"
 +#include "errcode.h"
 +#include "journal_types.h"
 +#include "replicas_types.h"
 +#include "six.h"
 +
 +struct open_bucket;
 +struct btree_update;
 +struct btree_trans;
 +
 +#define MAX_BSETS             3U
 +
 +struct btree_nr_keys {
 +
 +      /*
 +       * Amount of live metadata (i.e. size of node after a compaction) in
 +       * units of u64s
 +       */
 +      u16                     live_u64s;
 +      u16                     bset_u64s[MAX_BSETS];
 +
 +      /* live keys only: */
 +      u16                     packed_keys;
 +      u16                     unpacked_keys;
 +};
 +
 +struct bset_tree {
 +      /*
 +       * We construct a binary tree in an array as if the array
 +       * started at 1, so that things line up on the same cachelines
 +       * better: see comments in bset.c at cacheline_to_bkey() for
 +       * details
 +       */
 +
 +      /* size of the binary tree and prev array */
 +      u16                     size;
 +
 +      /* function of size - precalculated for to_inorder() */
 +      u16                     extra;
 +
 +      u16                     data_offset;
 +      u16                     aux_data_offset;
 +      u16                     end_offset;
 +};
 +
 +struct btree_write {
 +      struct journal_entry_pin        journal;
 +};
 +
 +struct btree_alloc {
 +      struct open_buckets     ob;
 +      __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
 +};
 +
 +struct btree_bkey_cached_common {
 +      struct six_lock         lock;
 +      u8                      level;
 +      u8                      btree_id;
 +      bool                    cached;
 +};
 +
 +struct btree {
 +      struct btree_bkey_cached_common c;
 +
 +      struct rhash_head       hash;
 +      u64                     hash_val;
 +
 +      unsigned long           flags;
 +      u16                     written;
 +      u8                      nsets;
 +      u8                      nr_key_bits;
 +      u16                     version_ondisk;
 +
 +      struct bkey_format      format;
 +
 +      struct btree_node       *data;
 +      void                    *aux_data;
 +
 +      /*
 +       * Sets of sorted keys - the real btree node - plus a binary search tree
 +       *
 +       * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
 +       * to the memory we have allocated for this btree node. Additionally,
 +       * set[0]->data points to the entire btree node as it exists on disk.
 +       */
 +      struct bset_tree        set[MAX_BSETS];
 +
 +      struct btree_nr_keys    nr;
 +      u16                     sib_u64s[2];
 +      u16                     whiteout_u64s;
 +      u8                      byte_order;
 +      u8                      unpack_fn_len;
 +
 +      struct btree_write      writes[2];
 +
 +      /* Key/pointer for this btree node */
 +      __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 +
 +      /*
 +       * XXX: add a delete sequence number, so when bch2_btree_node_relock()
 +       * fails because the lock sequence number has changed - i.e. the
 +       * contents were modified - we can still relock the node if it's still
 +       * the one we want, without redoing the traversal
 +       */
 +
 +      /*
 +       * For asynchronous splits/interior node updates:
 +       * When we do a split, we allocate new child nodes and update the parent
 +       * node to point to them: we update the parent in memory immediately,
 +       * but then we must wait until the children have been written out before
 +       * the update to the parent can be written - this is a list of the
 +       * btree_updates that are blocking this node from being
 +       * written:
 +       */
 +      struct list_head        write_blocked;
 +
 +      /*
 +       * Also for asynchronous splits/interior node updates:
 +       * If a btree node isn't reachable yet, we don't want to kick off
 +       * another write - because that write also won't yet be reachable and
 +       * marking it as completed before it's reachable would be incorrect:
 +       */
 +      unsigned long           will_make_reachable;
 +
 +      struct open_buckets     ob;
 +
 +      /* lru list */
 +      struct list_head        list;
 +};
 +
 +struct btree_cache {
 +      struct rhashtable       table;
 +      bool                    table_init_done;
 +      /*
 +       * We never free a struct btree, except on shutdown - we just put it on
 +       * the btree_cache_freed list and reuse it later. This simplifies the
 +       * code, and it doesn't cost us much memory as the memory usage is
 +       * dominated by buffers that hold the actual btree node data and those
 +       * can be freed - and the number of struct btrees allocated is
 +       * effectively bounded.
 +       *
 +       * btree_cache_freeable effectively is a small cache - we use it because
 +       * high order page allocations can be rather expensive, and it's quite
 +       * common to delete and allocate btree nodes in quick succession. It
 +       * should never grow past ~2-3 nodes in practice.
 +       */
 +      struct mutex            lock;
 +      struct list_head        live;
 +      struct list_head        freeable;
 +      struct list_head        freed_pcpu;
 +      struct list_head        freed_nonpcpu;
 +
 +      /* Number of elements in live + freeable lists */
 +      unsigned                used;
 +      unsigned                reserve;
 +      atomic_t                dirty;
-       struct shrinker         shrink;
++      struct shrinker         *shrink;
 +
 +      /*
 +       * If we need to allocate memory for a new btree node and that
 +       * allocation fails, we can cannibalize another node in the btree cache
 +       * to satisfy the allocation - lock to guarantee only one thread does
 +       * this at a time:
 +       */
 +      struct task_struct      *alloc_lock;
 +      struct closure_waitlist alloc_wait;
 +};
 +
 +struct btree_node_iter {
 +      struct btree_node_iter_set {
 +              u16     k, end;
 +      } data[MAX_BSETS];
 +};
 +
 +/*
 + * Iterate over all possible positions, synthesizing deleted keys for holes:
 + */
 +static const __maybe_unused u16 BTREE_ITER_SLOTS              = 1 << 0;
 +static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS         = 1 << 1;
 +/*
 + * Indicates that intent locks should be taken on leaf nodes, because we expect
 + * to be doing updates:
 + */
 +static const __maybe_unused u16 BTREE_ITER_INTENT             = 1 << 2;
 +/*
 + * Causes the btree iterator code to prefetch additional btree nodes from disk:
 + */
 +static const __maybe_unused u16 BTREE_ITER_PREFETCH           = 1 << 3;
 +/*
 + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
 + * @pos or the first key strictly greater than @pos
 + */
 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS         = 1 << 4;
 +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS                = 1 << 5;
 +static const __maybe_unused u16 BTREE_ITER_CACHED             = 1 << 6;
 +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE     = 1 << 7;
 +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES               = 1 << 8;
 +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL               = 1 << 9;
 +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS    = 1 << 10;
 +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS      = 1 << 11;
 +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS   = 1 << 12;
 +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE         = 1 << 13;
 +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL      = 1 << 14;
 +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL     = 1 << 15;
 +#define __BTREE_ITER_FLAGS_END                                               16
 +
 +enum btree_path_uptodate {
 +      BTREE_ITER_UPTODATE             = 0,
 +      BTREE_ITER_NEED_RELOCK          = 1,
 +      BTREE_ITER_NEED_TRAVERSE        = 2,
 +};
 +
 +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
 +#define TRACK_PATH_ALLOCATED
 +#endif
 +
 +struct btree_path {
 +      u8                      idx;
 +      u8                      sorted_idx;
 +      u8                      ref;
 +      u8                      intent_ref;
 +
 +      /* btree_iter_copy starts here: */
 +      struct bpos             pos;
 +
 +      enum btree_id           btree_id:5;
 +      bool                    cached:1;
 +      bool                    preserve:1;
 +      enum btree_path_uptodate uptodate:2;
 +      /*
 +       * When true, failing to relock this path will cause the transaction to
 +       * restart:
 +       */
 +      bool                    should_be_locked:1;
 +      unsigned                level:3,
 +                              locks_want:3;
 +      u8                      nodes_locked;
 +
 +      struct btree_path_level {
 +              struct btree    *b;
 +              struct btree_node_iter iter;
 +              u32             lock_seq;
 +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
 +              u64             lock_taken_time;
 +#endif
 +      }                       l[BTREE_MAX_DEPTH];
 +#ifdef TRACK_PATH_ALLOCATED
 +      unsigned long           ip_allocated;
 +#endif
 +};
 +
 +static inline struct btree_path_level *path_l(struct btree_path *path)
 +{
 +      return path->l + path->level;
 +}
 +
 +static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
 +{
 +#ifdef TRACK_PATH_ALLOCATED
 +      return path->ip_allocated;
 +#else
 +      return _THIS_IP_;
 +#endif
 +}
 +
 +/*
 + * @pos                       - iterator's current position
 + * @level             - current btree depth
 + * @locks_want                - btree level below which we start taking intent locks
 + * @nodes_locked      - bitmask indicating which nodes in @nodes are locked
 + * @nodes_intent_locked       - bitmask indicating which locks are intent locks
 + */
 +struct btree_iter {
 +      struct btree_trans      *trans;
 +      struct btree_path       *path;
 +      struct btree_path       *update_path;
 +      struct btree_path       *key_cache_path;
 +
 +      enum btree_id           btree_id:8;
 +      unsigned                min_depth:3;
 +      unsigned                advanced:1;
 +
 +      /* btree_iter_copy starts here: */
 +      u16                     flags;
 +
 +      /* When we're filtering by snapshot, the snapshot ID we're looking for: */
 +      unsigned                snapshot;
 +
 +      struct bpos             pos;
 +      /*
 +       * Current unpacked key - so that bch2_btree_iter_next()/
 +       * bch2_btree_iter_next_slot() can correctly advance pos.
 +       */
 +      struct bkey             k;
 +
 +      /* BTREE_ITER_WITH_JOURNAL: */
 +      size_t                  journal_idx;
 +      struct bpos             journal_pos;
 +#ifdef TRACK_PATH_ALLOCATED
 +      unsigned long           ip_allocated;
 +#endif
 +};
 +
 +struct btree_key_cache_freelist {
 +      struct bkey_cached      *objs[16];
 +      unsigned                nr;
 +};
 +
 +struct btree_key_cache {
 +      struct mutex            lock;
 +      struct rhashtable       table;
 +      bool                    table_init_done;
 +      struct list_head        freed_pcpu;
 +      struct list_head        freed_nonpcpu;
++      struct shrinker         *shrink;
 +      unsigned                shrink_iter;
 +      struct btree_key_cache_freelist __percpu *pcpu_freed;
 +
 +      atomic_long_t           nr_freed;
 +      atomic_long_t           nr_keys;
 +      atomic_long_t           nr_dirty;
 +};
 +
 +struct bkey_cached_key {
 +      u32                     btree_id;
 +      struct bpos             pos;
 +} __packed __aligned(4);
 +
 +#define BKEY_CACHED_ACCESSED          0
 +#define BKEY_CACHED_DIRTY             1
 +
 +struct bkey_cached {
 +      struct btree_bkey_cached_common c;
 +
 +      unsigned long           flags;
 +      u16                     u64s;
 +      bool                    valid;
 +      u32                     btree_trans_barrier_seq;
 +      struct bkey_cached_key  key;
 +
 +      struct rhash_head       hash;
 +      struct list_head        list;
 +
 +      struct journal_preres   res;
 +      struct journal_entry_pin journal;
 +      u64                     seq;
 +
 +      struct bkey_i           *k;
 +};
 +
 +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
 +{
 +      return !b->cached
 +              ? container_of(b, struct btree, c)->key.k.p
 +              : container_of(b, struct bkey_cached, c)->key.pos;
 +}
 +
 +struct btree_insert_entry {
 +      unsigned                flags;
 +      u8                      bkey_type;
 +      enum btree_id           btree_id:8;
 +      u8                      level:4;
 +      bool                    cached:1;
 +      bool                    insert_trigger_run:1;
 +      bool                    overwrite_trigger_run:1;
 +      bool                    key_cache_already_flushed:1;
 +      /*
 +       * @old_k may be a key from the journal; @old_btree_u64s always refers
 +       * to the size of the key being overwritten in the btree:
 +       */
 +      u8                      old_btree_u64s;
 +      struct bkey_i           *k;
 +      struct btree_path       *path;
 +      u64                     seq;
 +      /* key being overwritten: */
 +      struct bkey             old_k;
 +      const struct bch_val    *old_v;
 +      unsigned long           ip_allocated;
 +};
 +
 +#ifndef CONFIG_LOCKDEP
 +#define BTREE_ITER_MAX                64
 +#else
 +#define BTREE_ITER_MAX                32
 +#endif
 +
 +struct btree_trans_commit_hook;
 +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
 +
 +struct btree_trans_commit_hook {
 +      btree_trans_commit_hook_fn      *fn;
 +      struct btree_trans_commit_hook  *next;
 +};
 +
 +#define BTREE_TRANS_MEM_MAX   (1U << 16)
 +
 +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS     10000
 +
 +struct btree_trans {
 +      struct bch_fs           *c;
 +      const char              *fn;
 +      struct closure          ref;
 +      struct list_head        list;
 +      u64                     last_begin_time;
 +
 +      u8                      lock_may_not_fail;
 +      u8                      lock_must_abort;
 +      struct btree_bkey_cached_common *locking;
 +      struct six_lock_waiter  locking_wait;
 +
 +      int                     srcu_idx;
 +
 +      u8                      fn_idx;
 +      u8                      nr_sorted;
 +      u8                      nr_updates;
 +      u8                      nr_wb_updates;
 +      u8                      wb_updates_size;
 +      bool                    used_mempool:1;
 +      bool                    in_traverse_all:1;
 +      bool                    paths_sorted:1;
 +      bool                    memory_allocation_failure:1;
 +      bool                    journal_transaction_names:1;
 +      bool                    journal_replay_not_finished:1;
 +      bool                    notrace_relock_fail:1;
 +      enum bch_errcode        restarted:16;
 +      u32                     restart_count;
 +      unsigned long           last_begin_ip;
 +      unsigned long           last_restarted_ip;
 +      unsigned long           srcu_lock_time;
 +
 +      /*
 +       * For when bch2_trans_update notices we'll be splitting a compressed
 +       * extent:
 +       */
 +      unsigned                extra_journal_res;
 +      unsigned                nr_max_paths;
 +
 +      u64                     paths_allocated;
 +
 +      unsigned                mem_top;
 +      unsigned                mem_max;
 +      unsigned                mem_bytes;
 +      void                    *mem;
 +
 +      u8                      sorted[BTREE_ITER_MAX + 8];
 +      struct btree_path       paths[BTREE_ITER_MAX];
 +      struct btree_insert_entry updates[BTREE_ITER_MAX];
 +      struct btree_write_buffered_key *wb_updates;
 +
 +      /* update path: */
 +      struct btree_trans_commit_hook *hooks;
 +      darray_u64              extra_journal_entries;
 +      struct journal_entry_pin *journal_pin;
 +
 +      struct journal_res      journal_res;
 +      struct journal_preres   journal_preres;
 +      u64                     *journal_seq;
 +      struct disk_reservation *disk_res;
 +      unsigned                journal_u64s;
 +      unsigned                journal_preres_u64s;
 +      struct replicas_delta_list *fs_usage_deltas;
 +};
 +
 +#define BCH_BTREE_WRITE_TYPES()                                               \
 +      x(initial,              0)                                      \
 +      x(init_next_bset,       1)                                      \
 +      x(cache_reclaim,        2)                                      \
 +      x(journal_reclaim,      3)                                      \
 +      x(interior,             4)
 +
 +enum btree_write_type {
 +#define x(t, n) BTREE_WRITE_##t,
 +      BCH_BTREE_WRITE_TYPES()
 +#undef x
 +      BTREE_WRITE_TYPE_NR,
 +};
 +
 +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
 +#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
 +
 +#define BTREE_FLAGS()                                                 \
 +      x(read_in_flight)                                               \
 +      x(read_error)                                                   \
 +      x(dirty)                                                        \
 +      x(need_write)                                                   \
 +      x(write_blocked)                                                \
 +      x(will_make_reachable)                                          \
 +      x(noevict)                                                      \
 +      x(write_idx)                                                    \
 +      x(accessed)                                                     \
 +      x(write_in_flight)                                              \
 +      x(write_in_flight_inner)                                        \
 +      x(just_written)                                                 \
 +      x(dying)                                                        \
 +      x(fake)                                                         \
 +      x(need_rewrite)                                                 \
 +      x(never_write)
 +
 +enum btree_flags {
 +      /* First bits for btree node write type */
 +      BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
 +#define x(flag)       BTREE_NODE_##flag,
 +      BTREE_FLAGS()
 +#undef x
 +};
 +
 +#define x(flag)                                                               \
 +static inline bool btree_node_ ## flag(struct btree *b)                       \
 +{     return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
 +                                                                      \
 +static inline void set_btree_node_ ## flag(struct btree *b)           \
 +{     set_bit(BTREE_NODE_ ## flag, &b->flags); }                      \
 +                                                                      \
 +static inline void clear_btree_node_ ## flag(struct btree *b)         \
 +{     clear_bit(BTREE_NODE_ ## flag, &b->flags); }
 +
 +BTREE_FLAGS()
 +#undef x
 +
 +static inline struct btree_write *btree_current_write(struct btree *b)
 +{
 +      return b->writes + btree_node_write_idx(b);
 +}
 +
 +static inline struct btree_write *btree_prev_write(struct btree *b)
 +{
 +      return b->writes + (btree_node_write_idx(b) ^ 1);
 +}
 +
 +static inline struct bset_tree *bset_tree_last(struct btree *b)
 +{
 +      EBUG_ON(!b->nsets);
 +      return b->set + b->nsets - 1;
 +}
 +
 +static inline void *
 +__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
 +{
 +      return (void *) ((u64 *) b->data + 1 + offset);
 +}
 +
 +static inline u16
 +__btree_node_ptr_to_offset(const struct btree *b, const void *p)
 +{
 +      u16 ret = (u64 *) p - 1 - (u64 *) b->data;
 +
 +      EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
 +      return ret;
 +}
 +
 +static inline struct bset *bset(const struct btree *b,
 +                              const struct bset_tree *t)
 +{
 +      return __btree_node_offset_to_ptr(b, t->data_offset);
 +}
 +
 +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
 +{
 +      t->end_offset =
 +              __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
 +}
 +
 +static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
 +                                const struct bset *i)
 +{
 +      t->data_offset = __btree_node_ptr_to_offset(b, i);
 +      set_btree_bset_end(b, t);
 +}
 +
 +static inline struct bset *btree_bset_first(struct btree *b)
 +{
 +      return bset(b, b->set);
 +}
 +
 +static inline struct bset *btree_bset_last(struct btree *b)
 +{
 +      return bset(b, bset_tree_last(b));
 +}
 +
 +static inline u16
 +__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
 +{
 +      return __btree_node_ptr_to_offset(b, k);
 +}
 +
 +static inline struct bkey_packed *
 +__btree_node_offset_to_key(const struct btree *b, u16 k)
 +{
 +      return __btree_node_offset_to_ptr(b, k);
 +}
 +
 +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
 +{
 +      return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
 +}
 +
 +#define btree_bkey_first(_b, _t)                                      \
 +({                                                                    \
 +      EBUG_ON(bset(_b, _t)->start !=                                  \
 +              __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
 +                                                                      \
 +      bset(_b, _t)->start;                                            \
 +})
 +
 +#define btree_bkey_last(_b, _t)                                               \
 +({                                                                    \
 +      EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=     \
 +              vstruct_last(bset(_b, _t)));                            \
 +                                                                      \
 +      __btree_node_offset_to_key(_b, (_t)->end_offset);               \
 +})
 +
 +static inline unsigned bset_u64s(struct bset_tree *t)
 +{
 +      return t->end_offset - t->data_offset -
 +              sizeof(struct bset) / sizeof(u64);
 +}
 +
 +static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
 +{
 +      return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
 +}
 +
 +static inline unsigned bset_byte_offset(struct btree *b, void *i)
 +{
 +      return i - (void *) b->data;
 +}
 +
 +enum btree_node_type {
 +#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
 +      BCH_BTREE_IDS()
 +#undef x
 +      BKEY_TYPE_btree,
 +};
 +
 +/* Type of a key in btree @id at level @level: */
 +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
 +{
 +      return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
 +}
 +
 +/* Type of keys @b contains: */
 +static inline enum btree_node_type btree_node_type(struct btree *b)
 +{
 +      return __btree_node_type(b->c.level, b->c.btree_id);
 +}
 +
 +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS            \
 +      (BIT(BKEY_TYPE_extents)|                        \
 +       BIT(BKEY_TYPE_alloc)|                          \
 +       BIT(BKEY_TYPE_inodes)|                         \
 +       BIT(BKEY_TYPE_stripes)|                        \
 +       BIT(BKEY_TYPE_reflink)|                        \
 +       BIT(BKEY_TYPE_btree))
 +
 +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS              \
 +      (BIT(BKEY_TYPE_alloc)|                          \
 +       BIT(BKEY_TYPE_inodes)|                         \
 +       BIT(BKEY_TYPE_stripes)|                        \
 +       BIT(BKEY_TYPE_snapshots))
 +
 +#define BTREE_NODE_TYPE_HAS_TRIGGERS                  \
 +      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
 +       BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 +
 +static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 +{
 +      return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
 +}
 +
 +static inline bool btree_node_type_is_extents(enum btree_node_type type)
 +{
 +      const unsigned mask = 0
 +#define x(name, nr, flags, ...)       |((!!((flags) & BTREE_ID_EXTENTS)) << nr)
 +      BCH_BTREE_IDS()
 +#undef x
 +      ;
 +
 +      return (1U << type) & mask;
 +}
 +
 +static inline bool btree_id_is_extents(enum btree_id btree)
 +{
 +      return btree_node_type_is_extents((enum btree_node_type) btree);
 +}
 +
 +static inline bool btree_type_has_snapshots(enum btree_id id)
 +{
 +      const unsigned mask = 0
 +#define x(name, nr, flags, ...)       |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
 +      BCH_BTREE_IDS()
 +#undef x
 +      ;
 +
 +      return (1U << id) & mask;
 +}
 +
 +static inline bool btree_type_has_ptrs(enum btree_id id)
 +{
 +      const unsigned mask = 0
 +#define x(name, nr, flags, ...)       |((!!((flags) & BTREE_ID_DATA)) << nr)
 +      BCH_BTREE_IDS()
 +#undef x
 +      ;
 +
 +      return (1U << id) & mask;
 +}
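 +
 +/*
 + * Each of the three helpers above builds its mask at compile time by OR-ing
 + * in bit 'nr' for every entry of BCH_BTREE_IDS() whose flags word carries
 + * the relevant BTREE_ID_* bit; the runtime check is then a single bit test
 + * against that constant.
 + */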
 +
 +struct btree_root {
 +      struct btree            *b;
 +
 +      /* On disk root - see async splits: */
 +      __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 +      u8                      level;
 +      u8                      alive;
 +      s8                      error;
 +};
 +
 +enum btree_gc_coalesce_fail_reason {
 +      BTREE_GC_COALESCE_FAIL_RESERVE_GET,
 +      BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
 +      BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
 +};
 +
 +enum btree_node_sibling {
 +      btree_prev_sib,
 +      btree_next_sib,
 +};
 +
 +#endif /* _BCACHEFS_BTREE_TYPES_H */
index 6642b88c41a0e27379d189372d9a369d282c338d,0000000000000000000000000000000000000000..a2a5133fb6b5aec8a3023dacf24c717d416c745f
mode 100644,000000..100644
--- /dev/null
@@@ -1,1980 -1,0 +1,1980 @@@
-       sb->s_shrink.seeks = 0;
 +// SPDX-License-Identifier: GPL-2.0
 +#ifndef NO_BCACHEFS_FS
 +
 +#include "bcachefs.h"
 +#include "acl.h"
 +#include "bkey_buf.h"
 +#include "btree_update.h"
 +#include "buckets.h"
 +#include "chardev.h"
 +#include "dirent.h"
 +#include "errcode.h"
 +#include "extents.h"
 +#include "fs.h"
 +#include "fs-common.h"
 +#include "fs-io.h"
 +#include "fs-ioctl.h"
 +#include "fs-io-buffered.h"
 +#include "fs-io-direct.h"
 +#include "fs-io-pagecache.h"
 +#include "fsck.h"
 +#include "inode.h"
 +#include "io_read.h"
 +#include "journal.h"
 +#include "keylist.h"
 +#include "quota.h"
 +#include "snapshot.h"
 +#include "super.h"
 +#include "xattr.h"
 +
 +#include <linux/aio.h>
 +#include <linux/backing-dev.h>
 +#include <linux/exportfs.h>
 +#include <linux/fiemap.h>
 +#include <linux/module.h>
 +#include <linux/pagemap.h>
 +#include <linux/posix_acl.h>
 +#include <linux/random.h>
 +#include <linux/seq_file.h>
 +#include <linux/statfs.h>
 +#include <linux/string.h>
 +#include <linux/xattr.h>
 +
 +static struct kmem_cache *bch2_inode_cache;
 +
 +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
 +                              struct bch_inode_info *,
 +                              struct bch_inode_unpacked *,
 +                              struct bch_subvolume *);
 +
 +void bch2_inode_update_after_write(struct btree_trans *trans,
 +                                 struct bch_inode_info *inode,
 +                                 struct bch_inode_unpacked *bi,
 +                                 unsigned fields)
 +{
 +      struct bch_fs *c = trans->c;
 +
 +      BUG_ON(bi->bi_inum != inode->v.i_ino);
 +
 +      bch2_assert_pos_locked(trans, BTREE_ID_inodes,
 +                             POS(0, bi->bi_inum),
 +                             c->opts.inodes_use_key_cache);
 +
 +      set_nlink(&inode->v, bch2_inode_nlink_get(bi));
 +      i_uid_write(&inode->v, bi->bi_uid);
 +      i_gid_write(&inode->v, bi->bi_gid);
 +      inode->v.i_mode = bi->bi_mode;
 +
 +      if (fields & ATTR_ATIME)
 +              inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
 +      if (fields & ATTR_MTIME)
 +              inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
 +      if (fields & ATTR_CTIME)
 +              inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
 +
 +      inode->ei_inode         = *bi;
 +
 +      bch2_inode_flags_to_vfs(inode);
 +}
 +
 +int __must_check bch2_write_inode(struct bch_fs *c,
 +                                struct bch_inode_info *inode,
 +                                inode_set_fn set,
 +                                void *p, unsigned fields)
 +{
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      struct btree_iter iter = { NULL };
 +      struct bch_inode_unpacked inode_u;
 +      int ret;
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
 +                              BTREE_ITER_INTENT) ?:
 +              (set ? set(trans, inode, &inode_u, p) : 0) ?:
 +              bch2_inode_write(trans, &iter, &inode_u) ?:
 +              bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 +
 +      /*
 +       * the btree node lock protects inode->ei_inode, not ei_update_lock;
 +       * this is important for inode updates via bchfs_write_index_update
 +       */
 +      if (!ret)
 +              bch2_inode_update_after_write(trans, inode, &inode_u, fields);
 +
 +      bch2_trans_iter_exit(trans, &iter);
 +
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +
 +      bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
 +                           "inode %u:%llu not found when updating",
 +                           inode_inum(inode).subvol,
 +                           inode_inum(inode).inum);
 +
 +      bch2_trans_put(trans);
 +      return ret < 0 ? ret : 0;
 +}
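 +
 +/*
 + * Callers pass an inode_set_fn that edits the unpacked inode inside the
 + * btree transaction; bch2_write_inode() owns the peek/commit/restart loop.
 + * See inode_update_times_fn() and bch2_vfs_write_inode() later in this file
 + * for a minimal example.
 + */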
 +
 +int bch2_fs_quota_transfer(struct bch_fs *c,
 +                         struct bch_inode_info *inode,
 +                         struct bch_qid new_qid,
 +                         unsigned qtypes,
 +                         enum quota_acct_mode mode)
 +{
 +      unsigned i;
 +      int ret;
 +
 +      qtypes &= enabled_qtypes(c);
 +
 +      for (i = 0; i < QTYP_NR; i++)
 +              if (new_qid.q[i] == inode->ei_qid.q[i])
 +                      qtypes &= ~(1U << i);
 +
 +      if (!qtypes)
 +              return 0;
 +
 +      mutex_lock(&inode->ei_quota_lock);
 +
 +      ret = bch2_quota_transfer(c, qtypes, new_qid,
 +                                inode->ei_qid,
 +                                inode->v.i_blocks +
 +                                inode->ei_quota_reserved,
 +                                mode);
 +      if (!ret)
 +              for (i = 0; i < QTYP_NR; i++)
 +                      if (qtypes & (1 << i))
 +                              inode->ei_qid.q[i] = new_qid.q[i];
 +
 +      mutex_unlock(&inode->ei_quota_lock);
 +
 +      return ret;
 +}
 +
 +static int bch2_iget5_test(struct inode *vinode, void *p)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +      subvol_inum *inum = p;
 +
 +      return inode->ei_subvol == inum->subvol &&
 +              inode->ei_inode.bi_inum == inum->inum;
 +}
 +
 +static int bch2_iget5_set(struct inode *vinode, void *p)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +      subvol_inum *inum = p;
 +
 +      inode->v.i_ino          = inum->inum;
 +      inode->ei_subvol        = inum->subvol;
 +      inode->ei_inode.bi_inum = inum->inum;
 +      return 0;
 +}
 +
 +static unsigned bch2_inode_hash(subvol_inum inum)
 +{
 +      return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
 +}
 +
 +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 +{
 +      struct bch_inode_unpacked inode_u;
 +      struct bch_inode_info *inode;
 +      struct btree_trans *trans;
 +      struct bch_subvolume subvol;
 +      int ret;
 +
 +      inode = to_bch_ei(iget5_locked(c->vfs_sb,
 +                                     bch2_inode_hash(inum),
 +                                     bch2_iget5_test,
 +                                     bch2_iget5_set,
 +                                     &inum));
 +      if (unlikely(!inode))
 +              return ERR_PTR(-ENOMEM);
 +      if (!(inode->v.i_state & I_NEW))
 +              return &inode->v;
 +
 +      trans = bch2_trans_get(c);
 +      ret = lockrestart_do(trans,
 +              bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
 +              bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
 +
 +      if (!ret)
 +              bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 +      bch2_trans_put(trans);
 +
 +      if (ret) {
 +              iget_failed(&inode->v);
 +              return ERR_PTR(bch2_err_class(ret));
 +      }
 +
 +      mutex_lock(&c->vfs_inodes_lock);
 +      list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
 +      mutex_unlock(&c->vfs_inodes_lock);
 +
 +      unlock_new_inode(&inode->v);
 +
 +      return &inode->v;
 +}
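 +
 +/*
 + * Note the standard iget5 pattern above: if the inode was already in the
 + * inode cache (no I_NEW), it is returned as-is; otherwise it is populated
 + * from the inodes btree under a transaction and then published with
 + * unlock_new_inode().
 + */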
 +
 +struct bch_inode_info *
 +__bch2_create(struct mnt_idmap *idmap,
 +            struct bch_inode_info *dir, struct dentry *dentry,
 +            umode_t mode, dev_t rdev, subvol_inum snapshot_src,
 +            unsigned flags)
 +{
 +      struct bch_fs *c = dir->v.i_sb->s_fs_info;
 +      struct btree_trans *trans;
 +      struct bch_inode_unpacked dir_u;
 +      struct bch_inode_info *inode, *old;
 +      struct bch_inode_unpacked inode_u;
 +      struct posix_acl *default_acl = NULL, *acl = NULL;
 +      subvol_inum inum;
 +      struct bch_subvolume subvol;
 +      u64 journal_seq = 0;
 +      int ret;
 +
 +      /*
 +       * preallocate acls + vfs inode before btree transaction, so that
 +       * nothing can fail after the transaction succeeds:
 +       */
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
 +      if (ret)
 +              return ERR_PTR(ret);
 +#endif
 +      inode = to_bch_ei(new_inode(c->vfs_sb));
 +      if (unlikely(!inode)) {
 +              inode = ERR_PTR(-ENOMEM);
 +              goto err;
 +      }
 +
 +      bch2_inode_init_early(c, &inode_u);
 +
 +      if (!(flags & BCH_CREATE_TMPFILE))
 +              mutex_lock(&dir->ei_update_lock);
 +
 +      trans = bch2_trans_get(c);
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret   = bch2_create_trans(trans,
 +                                inode_inum(dir), &dir_u, &inode_u,
 +                                !(flags & BCH_CREATE_TMPFILE)
 +                                ? &dentry->d_name : NULL,
 +                                from_kuid(i_user_ns(&dir->v), current_fsuid()),
 +                                from_kgid(i_user_ns(&dir->v), current_fsgid()),
 +                                mode, rdev,
 +                                default_acl, acl, snapshot_src, flags) ?:
 +              bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
 +                              KEY_TYPE_QUOTA_PREALLOC);
 +      if (unlikely(ret))
 +              goto err_before_quota;
 +
 +      inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
 +      inum.inum = inode_u.bi_inum;
 +
 +      ret   = bch2_subvolume_get(trans, inum.subvol, true,
 +                                 BTREE_ITER_WITH_UPDATES, &subvol) ?:
 +              bch2_trans_commit(trans, NULL, &journal_seq, 0);
 +      if (unlikely(ret)) {
 +              bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
 +                              KEY_TYPE_QUOTA_WARN);
 +err_before_quota:
 +              if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +                      goto retry;
 +              goto err_trans;
 +      }
 +
 +      if (!(flags & BCH_CREATE_TMPFILE)) {
 +              bch2_inode_update_after_write(trans, dir, &dir_u,
 +                                            ATTR_MTIME|ATTR_CTIME);
 +              mutex_unlock(&dir->ei_update_lock);
 +      }
 +
 +      bch2_iget5_set(&inode->v, &inum);
 +      bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 +
 +      set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 +      set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
 +
 +      /*
 +       * we must insert the new inode into the inode cache before calling
 +       * bch2_trans_put() and dropping locks, else we could race with another
 +       * thread pulling the inode in and modifying it:
 +       */
 +
 +      inode->v.i_state |= I_CREATING;
 +
 +      old = to_bch_ei(inode_insert5(&inode->v,
 +                                    bch2_inode_hash(inum),
 +                                    bch2_iget5_test,
 +                                    bch2_iget5_set,
 +                                    &inum));
 +      BUG_ON(!old);
 +
 +      if (unlikely(old != inode)) {
 +              /*
 +               * We raced, another process pulled the new inode into cache
 +               * before us:
 +               */
 +              make_bad_inode(&inode->v);
 +              iput(&inode->v);
 +
 +              inode = old;
 +      } else {
 +              mutex_lock(&c->vfs_inodes_lock);
 +              list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
 +              mutex_unlock(&c->vfs_inodes_lock);
 +              /*
 +               * we really don't want insert_inode_locked2() to be setting
 +               * I_NEW...
 +               */
 +              unlock_new_inode(&inode->v);
 +      }
 +
 +      bch2_trans_put(trans);
 +err:
 +      posix_acl_release(default_acl);
 +      posix_acl_release(acl);
 +      return inode;
 +err_trans:
 +      if (!(flags & BCH_CREATE_TMPFILE))
 +              mutex_unlock(&dir->ei_update_lock);
 +
 +      bch2_trans_put(trans);
 +      make_bad_inode(&inode->v);
 +      iput(&inode->v);
 +      inode = ERR_PTR(ret);
 +      goto err;
 +}
 +
 +/* methods */
 +
 +static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 +                                unsigned int flags)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir);
 +      struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
 +      struct inode *vinode = NULL;
 +      subvol_inum inum = { .subvol = 1 };
 +      int ret;
 +
 +      ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
 +                               &dentry->d_name, &inum);
 +
 +      if (!ret)
 +              vinode = bch2_vfs_inode_get(c, inum);
 +
 +      return d_splice_alias(vinode, dentry);
 +}
 +
 +static int bch2_mknod(struct mnt_idmap *idmap,
 +                    struct inode *vdir, struct dentry *dentry,
 +                    umode_t mode, dev_t rdev)
 +{
 +      struct bch_inode_info *inode =
 +              __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
 +                            (subvol_inum) { 0 }, 0);
 +
 +      if (IS_ERR(inode))
 +              return bch2_err_class(PTR_ERR(inode));
 +
 +      d_instantiate(dentry, &inode->v);
 +      return 0;
 +}
 +
 +static int bch2_create(struct mnt_idmap *idmap,
 +                     struct inode *vdir, struct dentry *dentry,
 +                     umode_t mode, bool excl)
 +{
 +      return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
 +}
 +
 +static int __bch2_link(struct bch_fs *c,
 +                     struct bch_inode_info *inode,
 +                     struct bch_inode_info *dir,
 +                     struct dentry *dentry)
 +{
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      struct bch_inode_unpacked dir_u, inode_u;
 +      int ret;
 +
 +      mutex_lock(&inode->ei_update_lock);
 +
 +      ret = commit_do(trans, NULL, NULL, 0,
 +                      bch2_link_trans(trans,
 +                                      inode_inum(dir),   &dir_u,
 +                                      inode_inum(inode), &inode_u,
 +                                      &dentry->d_name));
 +
 +      if (likely(!ret)) {
 +              bch2_inode_update_after_write(trans, dir, &dir_u,
 +                                            ATTR_MTIME|ATTR_CTIME);
 +              bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
 +      }
 +
 +      bch2_trans_put(trans);
 +      mutex_unlock(&inode->ei_update_lock);
 +      return ret;
 +}
 +
 +static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 +                   struct dentry *dentry)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir);
 +      struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
 +      int ret;
 +
 +      lockdep_assert_held(&inode->v.i_rwsem);
 +
 +      ret = __bch2_link(c, inode, dir, dentry);
 +      if (unlikely(ret))
 +              return ret;
 +
 +      ihold(&inode->v);
 +      d_instantiate(dentry, &inode->v);
 +      return 0;
 +}
 +
 +int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 +                bool deleting_snapshot)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir);
 +      struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 +      struct bch_inode_unpacked dir_u, inode_u;
 +      struct btree_trans *trans = bch2_trans_get(c);
 +      int ret;
 +
 +      bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
 +
 +      ret = commit_do(trans, NULL, NULL,
 +                      BTREE_INSERT_NOFAIL,
 +              bch2_unlink_trans(trans,
 +                                inode_inum(dir), &dir_u,
 +                                &inode_u, &dentry->d_name,
 +                                deleting_snapshot));
 +      if (unlikely(ret))
 +              goto err;
 +
 +      bch2_inode_update_after_write(trans, dir, &dir_u,
 +                                    ATTR_MTIME|ATTR_CTIME);
 +      bch2_inode_update_after_write(trans, inode, &inode_u,
 +                                    ATTR_MTIME);
 +
 +      if (inode_u.bi_subvol) {
 +              /*
 +               * Subvolume deletion is asynchronous, but we still want to tell
 +               * the VFS that it's been deleted here:
 +               */
 +              set_nlink(&inode->v, 0);
 +      }
 +err:
 +      bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
 +      bch2_trans_put(trans);
 +
 +      return ret;
 +}
 +
 +static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 +{
 +      return __bch2_unlink(vdir, dentry, false);
 +}
 +
 +static int bch2_symlink(struct mnt_idmap *idmap,
 +                      struct inode *vdir, struct dentry *dentry,
 +                      const char *symname)
 +{
 +      struct bch_fs *c = vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
 +      int ret;
 +
 +      inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
 +                            (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 +      if (IS_ERR(inode))
 +              return bch2_err_class(PTR_ERR(inode));
 +
 +      inode_lock(&inode->v);
 +      ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
 +      inode_unlock(&inode->v);
 +
 +      if (unlikely(ret))
 +              goto err;
 +
 +      ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
 +      if (unlikely(ret))
 +              goto err;
 +
 +      ret = __bch2_link(c, inode, dir, dentry);
 +      if (unlikely(ret))
 +              goto err;
 +
 +      d_instantiate(dentry, &inode->v);
 +      return 0;
 +err:
 +      iput(&inode->v);
 +      return ret;
 +}
 +
 +static int bch2_mkdir(struct mnt_idmap *idmap,
 +                    struct inode *vdir, struct dentry *dentry, umode_t mode)
 +{
 +      return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
 +}
 +
 +static int bch2_rename2(struct mnt_idmap *idmap,
 +                      struct inode *src_vdir, struct dentry *src_dentry,
 +                      struct inode *dst_vdir, struct dentry *dst_dentry,
 +                      unsigned flags)
 +{
 +      struct bch_fs *c = src_vdir->i_sb->s_fs_info;
 +      struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
 +      struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
 +      struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
 +      struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
 +      struct bch_inode_unpacked dst_dir_u, src_dir_u;
 +      struct bch_inode_unpacked src_inode_u, dst_inode_u;
 +      struct btree_trans *trans;
 +      enum bch_rename_mode mode = flags & RENAME_EXCHANGE
 +              ? BCH_RENAME_EXCHANGE
 +              : dst_dentry->d_inode
 +              ? BCH_RENAME_OVERWRITE : BCH_RENAME;
 +      int ret;
 +
 +      if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
 +              return -EINVAL;
 +
 +      if (mode == BCH_RENAME_OVERWRITE) {
 +              ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
 +                                                 0, LLONG_MAX);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      trans = bch2_trans_get(c);
 +
 +      bch2_lock_inodes(INODE_UPDATE_LOCK,
 +                       src_dir,
 +                       dst_dir,
 +                       src_inode,
 +                       dst_inode);
 +
 +      if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
 +              ret = bch2_fs_quota_transfer(c, src_inode,
 +                                           dst_dir->ei_qid,
 +                                           1 << QTYP_PRJ,
 +                                           KEY_TYPE_QUOTA_PREALLOC);
 +              if (ret)
 +                      goto err;
 +      }
 +
 +      if (mode == BCH_RENAME_EXCHANGE &&
 +          inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
 +              ret = bch2_fs_quota_transfer(c, dst_inode,
 +                                           src_dir->ei_qid,
 +                                           1 << QTYP_PRJ,
 +                                           KEY_TYPE_QUOTA_PREALLOC);
 +              if (ret)
 +                      goto err;
 +      }
 +
 +      ret = commit_do(trans, NULL, NULL, 0,
 +                      bch2_rename_trans(trans,
 +                                        inode_inum(src_dir), &src_dir_u,
 +                                        inode_inum(dst_dir), &dst_dir_u,
 +                                        &src_inode_u,
 +                                        &dst_inode_u,
 +                                        &src_dentry->d_name,
 +                                        &dst_dentry->d_name,
 +                                        mode));
 +      if (unlikely(ret))
 +              goto err;
 +
 +      BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
 +      BUG_ON(dst_inode &&
 +             dst_inode->v.i_ino != dst_inode_u.bi_inum);
 +
 +      bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
 +                                    ATTR_MTIME|ATTR_CTIME);
 +
 +      if (src_dir != dst_dir)
 +              bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
 +                                            ATTR_MTIME|ATTR_CTIME);
 +
 +      bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
 +                                    ATTR_CTIME);
 +
 +      if (dst_inode)
 +              bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
 +                                            ATTR_CTIME);
 +err:
 +      bch2_trans_put(trans);
 +
 +      bch2_fs_quota_transfer(c, src_inode,
 +                             bch_qid(&src_inode->ei_inode),
 +                             1 << QTYP_PRJ,
 +                             KEY_TYPE_QUOTA_NOCHECK);
 +      if (dst_inode)
 +              bch2_fs_quota_transfer(c, dst_inode,
 +                                     bch_qid(&dst_inode->ei_inode),
 +                                     1 << QTYP_PRJ,
 +                                     KEY_TYPE_QUOTA_NOCHECK);
 +
 +      bch2_unlock_inodes(INODE_UPDATE_LOCK,
 +                         src_dir,
 +                         dst_dir,
 +                         src_inode,
 +                         dst_inode);
 +
 +      return ret;
 +}
 +
 +static void bch2_setattr_copy(struct mnt_idmap *idmap,
 +                            struct bch_inode_info *inode,
 +                            struct bch_inode_unpacked *bi,
 +                            struct iattr *attr)
 +{
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      unsigned int ia_valid = attr->ia_valid;
 +
 +      if (ia_valid & ATTR_UID)
 +              bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
 +      if (ia_valid & ATTR_GID)
 +              bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
 +
 +      if (ia_valid & ATTR_SIZE)
 +              bi->bi_size = attr->ia_size;
 +
 +      if (ia_valid & ATTR_ATIME)
 +              bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
 +      if (ia_valid & ATTR_MTIME)
 +              bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
 +      if (ia_valid & ATTR_CTIME)
 +              bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
 +
 +      if (ia_valid & ATTR_MODE) {
 +              umode_t mode = attr->ia_mode;
 +              kgid_t gid = ia_valid & ATTR_GID
 +                      ? attr->ia_gid
 +                      : inode->v.i_gid;
 +
 +              if (!in_group_p(gid) &&
 +                  !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
 +                      mode &= ~S_ISGID;
 +              bi->bi_mode = mode;
 +      }
 +}
 +
 +int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 +                       struct bch_inode_info *inode,
 +                       struct iattr *attr)
 +{
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      struct bch_qid qid;
 +      struct btree_trans *trans;
 +      struct btree_iter inode_iter = { NULL };
 +      struct bch_inode_unpacked inode_u;
 +      struct posix_acl *acl = NULL;
 +      int ret;
 +
 +      mutex_lock(&inode->ei_update_lock);
 +
 +      qid = inode->ei_qid;
 +
 +      if (attr->ia_valid & ATTR_UID)
 +              qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
 +
 +      if (attr->ia_valid & ATTR_GID)
 +              qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
 +
 +      ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
 +                                   KEY_TYPE_QUOTA_PREALLOC);
 +      if (ret)
 +              goto err;
 +
 +      trans = bch2_trans_get(c);
 +retry:
 +      bch2_trans_begin(trans);
 +      kfree(acl);
 +      acl = NULL;
 +
 +      ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
 +                            BTREE_ITER_INTENT);
 +      if (ret)
 +              goto btree_err;
 +
 +      bch2_setattr_copy(idmap, inode, &inode_u, attr);
 +
 +      if (attr->ia_valid & ATTR_MODE) {
 +              ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
 +                                   inode_u.bi_mode, &acl);
 +              if (ret)
 +                      goto btree_err;
 +      }
 +
 +      ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
 +              bch2_trans_commit(trans, NULL, NULL,
 +                                BTREE_INSERT_NOFAIL);
 +btree_err:
 +      bch2_trans_iter_exit(trans, &inode_iter);
 +
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +      if (unlikely(ret))
 +              goto err_trans;
 +
 +      bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
 +
 +      if (acl)
 +              set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 +err_trans:
 +      bch2_trans_put(trans);
 +err:
 +      mutex_unlock(&inode->ei_update_lock);
 +
 +      return bch2_err_class(ret);
 +}
 +
 +static int bch2_getattr(struct mnt_idmap *idmap,
 +                      const struct path *path, struct kstat *stat,
 +                      u32 request_mask, unsigned query_flags)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +
 +      stat->dev       = inode->v.i_sb->s_dev;
 +      stat->ino       = inode->v.i_ino;
 +      stat->mode      = inode->v.i_mode;
 +      stat->nlink     = inode->v.i_nlink;
 +      stat->uid       = inode->v.i_uid;
 +      stat->gid       = inode->v.i_gid;
 +      stat->rdev      = inode->v.i_rdev;
 +      stat->size      = i_size_read(&inode->v);
 +      stat->atime     = inode_get_atime(&inode->v);
 +      stat->mtime     = inode_get_mtime(&inode->v);
 +      stat->ctime     = inode_get_ctime(&inode->v);
 +      stat->blksize   = block_bytes(c);
 +      stat->blocks    = inode->v.i_blocks;
 +
 +      if (request_mask & STATX_BTIME) {
 +              stat->result_mask |= STATX_BTIME;
 +              stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
 +      }
 +
 +      if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
 +              stat->attributes |= STATX_ATTR_IMMUTABLE;
 +      stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
 +
 +      if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
 +              stat->attributes |= STATX_ATTR_APPEND;
 +      stat->attributes_mask    |= STATX_ATTR_APPEND;
 +
 +      if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
 +              stat->attributes |= STATX_ATTR_NODUMP;
 +      stat->attributes_mask    |= STATX_ATTR_NODUMP;
 +
 +      return 0;
 +}
 +
 +static int bch2_setattr(struct mnt_idmap *idmap,
 +                      struct dentry *dentry, struct iattr *iattr)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 +      int ret;
 +
 +      lockdep_assert_held(&inode->v.i_rwsem);
 +
 +      ret = setattr_prepare(idmap, dentry, iattr);
 +      if (ret)
 +              return ret;
 +
 +      return iattr->ia_valid & ATTR_SIZE
 +              ? bchfs_truncate(idmap, inode, iattr)
 +              : bch2_setattr_nonsize(idmap, inode, iattr);
 +}
 +
 +static int bch2_tmpfile(struct mnt_idmap *idmap,
 +                      struct inode *vdir, struct file *file, umode_t mode)
 +{
 +      struct bch_inode_info *inode =
 +              __bch2_create(idmap, to_bch_ei(vdir),
 +                            file->f_path.dentry, mode, 0,
 +                            (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 +
 +      if (IS_ERR(inode))
 +              return bch2_err_class(PTR_ERR(inode));
 +
 +      d_mark_tmpfile(file, &inode->v);
 +      d_instantiate(file->f_path.dentry, &inode->v);
 +      return finish_open_simple(file, 0);
 +}
 +
 +static int bch2_fill_extent(struct bch_fs *c,
 +                          struct fiemap_extent_info *info,
 +                          struct bkey_s_c k, unsigned flags)
 +{
 +      if (bkey_extent_is_direct_data(k.k)) {
 +              struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 +              const union bch_extent_entry *entry;
 +              struct extent_ptr_decoded p;
 +              int ret;
 +
 +              if (k.k->type == KEY_TYPE_reflink_v)
 +                      flags |= FIEMAP_EXTENT_SHARED;
 +
 +              bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 +                      int flags2 = 0;
 +                      u64 offset = p.ptr.offset;
 +
 +                      if (p.ptr.unwritten)
 +                              flags2 |= FIEMAP_EXTENT_UNWRITTEN;
 +
 +                      if (p.crc.compression_type)
 +                              flags2 |= FIEMAP_EXTENT_ENCODED;
 +                      else
 +                              offset += p.crc.offset;
 +
 +                      if ((offset & (block_sectors(c) - 1)) ||
 +                          (k.k->size & (block_sectors(c) - 1)))
 +                              flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
 +
 +                      ret = fiemap_fill_next_extent(info,
 +                                              bkey_start_offset(k.k) << 9,
 +                                              offset << 9,
 +                                              k.k->size << 9, flags|flags2);
 +                      if (ret)
 +                              return ret;
 +              }
 +
 +              return 0;
 +      } else if (bkey_extent_is_inline_data(k.k)) {
 +              return fiemap_fill_next_extent(info,
 +                                             bkey_start_offset(k.k) << 9,
 +                                             0, k.k->size << 9,
 +                                             flags|
 +                                             FIEMAP_EXTENT_DATA_INLINE);
 +      } else if (k.k->type == KEY_TYPE_reservation) {
 +              return fiemap_fill_next_extent(info,
 +                                             bkey_start_offset(k.k) << 9,
 +                                             0, k.k->size << 9,
 +                                             flags|
 +                                             FIEMAP_EXTENT_DELALLOC|
 +                                             FIEMAP_EXTENT_UNWRITTEN);
 +      } else {
 +              BUG();
 +      }
 +}
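 +
 +/*
 + * Extent keys are in 512-byte sectors; the "<< 9" conversions above produce
 + * the byte offsets and lengths that fiemap_fill_next_extent() expects.
 + */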
 +
 +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 +                     u64 start, u64 len)
 +{
 +      struct bch_fs *c = vinode->i_sb->s_fs_info;
 +      struct bch_inode_info *ei = to_bch_ei(vinode);
 +      struct btree_trans *trans;
 +      struct btree_iter iter;
 +      struct bkey_s_c k;
 +      struct bkey_buf cur, prev;
 +      struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
 +      unsigned offset_into_extent, sectors;
 +      bool have_extent = false;
 +      u32 snapshot;
 +      int ret = 0;
 +
 +      ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
 +      if (ret)
 +              return ret;
 +
 +      if (start + len < start)
 +              return -EINVAL;
 +
 +      start >>= 9;
 +
 +      bch2_bkey_buf_init(&cur);
 +      bch2_bkey_buf_init(&prev);
 +      trans = bch2_trans_get(c);
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
 +      if (ret)
 +              goto err;
 +
 +      bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 +                           SPOS(ei->v.i_ino, start, snapshot), 0);
 +
 +      while (!(ret = btree_trans_too_many_iters(trans)) &&
 +             (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
 +             !(ret = bkey_err(k))) {
 +              enum btree_id data_btree = BTREE_ID_extents;
 +
 +              if (!bkey_extent_is_data(k.k) &&
 +                  k.k->type != KEY_TYPE_reservation) {
 +                      bch2_btree_iter_advance(&iter);
 +                      continue;
 +              }
 +
 +              offset_into_extent      = iter.pos.offset -
 +                      bkey_start_offset(k.k);
 +              sectors                 = k.k->size - offset_into_extent;
 +
 +              bch2_bkey_buf_reassemble(&cur, c, k);
 +
 +              ret = bch2_read_indirect_extent(trans, &data_btree,
 +                                      &offset_into_extent, &cur);
 +              if (ret)
 +                      break;
 +
 +              k = bkey_i_to_s_c(cur.k);
 +              bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
 +
 +              sectors = min(sectors, k.k->size - offset_into_extent);
 +
 +              bch2_cut_front(POS(k.k->p.inode,
 +                                 bkey_start_offset(k.k) +
 +                                 offset_into_extent),
 +                             cur.k);
 +              bch2_key_resize(&cur.k->k, sectors);
 +              cur.k->k.p = iter.pos;
 +              cur.k->k.p.offset += cur.k->k.size;
 +
 +              if (have_extent) {
 +                      bch2_trans_unlock(trans);
 +                      ret = bch2_fill_extent(c, info,
 +                                      bkey_i_to_s_c(prev.k), 0);
 +                      if (ret)
 +                              break;
 +              }
 +
 +              bkey_copy(prev.k, cur.k);
 +              have_extent = true;
 +
 +              bch2_btree_iter_set_pos(&iter,
 +                      POS(iter.pos.inode, iter.pos.offset + sectors));
 +      }
 +      start = iter.pos.offset;
 +      bch2_trans_iter_exit(trans, &iter);
 +err:
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +
 +      if (!ret && have_extent) {
 +              bch2_trans_unlock(trans);
 +              ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
 +                                     FIEMAP_EXTENT_LAST);
 +      }
 +
 +      bch2_trans_put(trans);
 +      bch2_bkey_buf_exit(&cur, c);
 +      bch2_bkey_buf_exit(&prev, c);
 +      return ret < 0 ? ret : 0;
 +}
 +
 +static const struct vm_operations_struct bch_vm_ops = {
 +      .fault          = bch2_page_fault,
 +      .map_pages      = filemap_map_pages,
 +      .page_mkwrite   = bch2_page_mkwrite,
 +};
 +
 +static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
 +{
 +      file_accessed(file);
 +
 +      vma->vm_ops = &bch_vm_ops;
 +      return 0;
 +}
 +
 +/* Directories: */
 +
 +static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
 +{
 +      return generic_file_llseek_size(file, offset, whence,
 +                                      S64_MAX, S64_MAX);
 +}
 +
 +static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 +{
 +      struct bch_inode_info *inode = file_bch_inode(file);
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      int ret;
 +
 +      if (!dir_emit_dots(file, ctx))
 +              return 0;
 +
 +      ret = bch2_readdir(c, inode_inum(inode), ctx);
 +      if (ret)
 +              bch_err_fn(c, ret);
 +
 +      return bch2_err_class(ret);
 +}
 +
 +static const struct file_operations bch_file_operations = {
 +      .llseek         = bch2_llseek,
 +      .read_iter      = bch2_read_iter,
 +      .write_iter     = bch2_write_iter,
 +      .mmap           = bch2_mmap,
 +      .open           = generic_file_open,
 +      .fsync          = bch2_fsync,
 +      .splice_read    = filemap_splice_read,
 +      .splice_write   = iter_file_splice_write,
 +      .fallocate      = bch2_fallocate_dispatch,
 +      .unlocked_ioctl = bch2_fs_file_ioctl,
 +#ifdef CONFIG_COMPAT
 +      .compat_ioctl   = bch2_compat_fs_ioctl,
 +#endif
 +      .remap_file_range = bch2_remap_file_range,
 +};
 +
 +static const struct inode_operations bch_file_inode_operations = {
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .fiemap         = bch2_fiemap,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct inode_operations bch_dir_inode_operations = {
 +      .lookup         = bch2_lookup,
 +      .create         = bch2_create,
 +      .link           = bch2_link,
 +      .unlink         = bch2_unlink,
 +      .symlink        = bch2_symlink,
 +      .mkdir          = bch2_mkdir,
 +      .rmdir          = bch2_unlink,
 +      .mknod          = bch2_mknod,
 +      .rename         = bch2_rename2,
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .tmpfile        = bch2_tmpfile,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct file_operations bch_dir_file_operations = {
 +      .llseek         = bch2_dir_llseek,
 +      .read           = generic_read_dir,
 +      .iterate_shared = bch2_vfs_readdir,
 +      .fsync          = bch2_fsync,
 +      .unlocked_ioctl = bch2_fs_file_ioctl,
 +#ifdef CONFIG_COMPAT
 +      .compat_ioctl   = bch2_compat_fs_ioctl,
 +#endif
 +};
 +
 +static const struct inode_operations bch_symlink_inode_operations = {
 +      .get_link       = page_get_link,
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct inode_operations bch_special_inode_operations = {
 +      .getattr        = bch2_getattr,
 +      .setattr        = bch2_setattr,
 +      .listxattr      = bch2_xattr_list,
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      .get_acl        = bch2_get_acl,
 +      .set_acl        = bch2_set_acl,
 +#endif
 +};
 +
 +static const struct address_space_operations bch_address_space_operations = {
 +      .read_folio     = bch2_read_folio,
 +      .writepages     = bch2_writepages,
 +      .readahead      = bch2_readahead,
 +      .dirty_folio    = filemap_dirty_folio,
 +      .write_begin    = bch2_write_begin,
 +      .write_end      = bch2_write_end,
 +      .invalidate_folio = bch2_invalidate_folio,
 +      .release_folio  = bch2_release_folio,
 +      .direct_IO      = noop_direct_IO,
 +#ifdef CONFIG_MIGRATION
 +      .migrate_folio  = filemap_migrate_folio,
 +#endif
 +      .error_remove_page = generic_error_remove_page,
 +};
 +
 +struct bcachefs_fid {
 +      u64             inum;
 +      u32             subvol;
 +      u32             gen;
 +} __packed;
 +
 +struct bcachefs_fid_with_parent {
 +      struct bcachefs_fid     fid;
 +      struct bcachefs_fid     dir;
 +} __packed;
 +
 +static int bcachefs_fid_valid(int fh_len, int fh_type)
 +{
 +      switch (fh_type) {
 +      case FILEID_BCACHEFS_WITHOUT_PARENT:
 +              return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
 +      case FILEID_BCACHEFS_WITH_PARENT:
 +              return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
 +      default:
 +              return false;
 +      }
 +}
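 +
 +/*
 + * fh_len is counted in 32-bit words, as usual for exportfs file handles:
 + * struct bcachefs_fid packs a u64 plus two u32s into 16 bytes (4 words),
 + * and bcachefs_fid_with_parent is two of those (8 words).
 + */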
 +
 +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
 +{
 +      return (struct bcachefs_fid) {
 +              .inum   = inode->ei_inode.bi_inum,
 +              .subvol = inode->ei_subvol,
 +              .gen    = inode->ei_inode.bi_generation,
 +      };
 +}
 +
 +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
 +                        struct inode *vdir)
 +{
 +      struct bch_inode_info *inode    = to_bch_ei(vinode);
 +      struct bch_inode_info *dir      = to_bch_ei(vdir);
 +
 +      if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
 +              return FILEID_INVALID;
 +
 +      if (!S_ISDIR(inode->v.i_mode) && dir) {
 +              struct bcachefs_fid_with_parent *fid = (void *) fh;
 +
 +              fid->fid = bch2_inode_to_fid(inode);
 +              fid->dir = bch2_inode_to_fid(dir);
 +
 +              *len = sizeof(*fid) / sizeof(u32);
 +              return FILEID_BCACHEFS_WITH_PARENT;
 +      } else {
 +              struct bcachefs_fid *fid = (void *) fh;
 +
 +              *fid = bch2_inode_to_fid(inode);
 +
 +              *len = sizeof(*fid) / sizeof(u32);
 +              return FILEID_BCACHEFS_WITHOUT_PARENT;
 +      }
 +}
 +
 +static struct inode *bch2_nfs_get_inode(struct super_block *sb,
 +                                      struct bcachefs_fid fid)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
 +                                  .subvol = fid.subvol,
 +                                  .inum = fid.inum,
 +      });
 +      if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
 +              iput(vinode);
 +              vinode = ERR_PTR(-ESTALE);
 +      }
 +      return vinode;
 +}
 +
 +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
 +              int fh_len, int fh_type)
 +{
 +      struct bcachefs_fid *fid = (void *) _fid;
 +
 +      if (!bcachefs_fid_valid(fh_len, fh_type))
 +              return NULL;
 +
 +      return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
 +}
 +
 +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
 +              int fh_len, int fh_type)
 +{
 +      struct bcachefs_fid_with_parent *fid = (void *) _fid;
 +
 +      if (!bcachefs_fid_valid(fh_len, fh_type) ||
 +          fh_type != FILEID_BCACHEFS_WITH_PARENT)
 +              return NULL;
 +
 +      return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
 +}
 +
 +static struct dentry *bch2_get_parent(struct dentry *child)
 +{
 +      struct bch_inode_info *inode = to_bch_ei(child->d_inode);
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      subvol_inum parent_inum = {
 +              .subvol = inode->ei_inode.bi_parent_subvol ?:
 +                      inode->ei_subvol,
 +              .inum = inode->ei_inode.bi_dir,
 +      };
 +
 +      if (!parent_inum.inum)
 +              return NULL;
 +
 +      return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
 +}
 +
 +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 +{
 +      struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
 +      struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +      struct btree_trans *trans;
 +      struct btree_iter iter1;
 +      struct btree_iter iter2;
 +      struct bkey_s_c k;
 +      struct bkey_s_c_dirent d;
 +      struct bch_inode_unpacked inode_u;
 +      subvol_inum target;
 +      u32 snapshot;
 +      struct qstr dirent_name;
 +      unsigned name_len = 0;
 +      int ret;
 +
 +      if (!S_ISDIR(dir->v.i_mode))
 +              return -EINVAL;
 +
 +      trans = bch2_trans_get(c);
 +
 +      bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
 +                           POS(dir->ei_inode.bi_inum, 0), 0);
 +      bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
 +                           POS(dir->ei_inode.bi_inum, 0), 0);
 +retry:
 +      bch2_trans_begin(trans);
 +
 +      ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
 +      if (ret)
 +              goto err;
 +
 +      bch2_btree_iter_set_snapshot(&iter1, snapshot);
 +      bch2_btree_iter_set_snapshot(&iter2, snapshot);
 +
 +      ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
 +      if (ret)
 +              goto err;
 +
 +      if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
 +              bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
 +
 +              k = bch2_btree_iter_peek_slot(&iter1);
 +              ret = bkey_err(k);
 +              if (ret)
 +                      goto err;
 +
 +              if (k.k->type != KEY_TYPE_dirent) {
 +                      ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
 +                      goto err;
 +              }
 +
 +              d = bkey_s_c_to_dirent(k);
 +              ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 +              if (ret > 0)
 +                      ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
 +              if (ret)
 +                      goto err;
 +
 +              if (target.subvol       == inode->ei_subvol &&
 +                  target.inum         == inode->ei_inode.bi_inum)
 +                      goto found;
 +      } else {
 +              /*
 +               * File with multiple hardlinks and our backref is to the wrong
 +               * directory - linear search:
 +               */
 +              for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
 +                      if (k.k->p.inode > dir->ei_inode.bi_inum)
 +                              break;
 +
 +                      if (k.k->type != KEY_TYPE_dirent)
 +                              continue;
 +
 +                      d = bkey_s_c_to_dirent(k);
 +                      ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 +                      if (ret < 0)
 +                              break;
 +                      if (ret)
 +                              continue;
 +
 +                      if (target.subvol       == inode->ei_subvol &&
 +                          target.inum         == inode->ei_inode.bi_inum)
 +                              goto found;
 +              }
 +      }
 +
 +      ret = -ENOENT;
 +      goto err;
 +found:
 +      dirent_name = bch2_dirent_get_name(d);
 +
 +      name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
 +      memcpy(name, dirent_name.name, name_len);
 +      name[name_len] = '\0';
 +err:
 +      if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 +              goto retry;
 +
 +      bch2_trans_iter_exit(trans, &iter1);
 +      bch2_trans_iter_exit(trans, &iter2);
 +      bch2_trans_put(trans);
 +
 +      return ret;
 +}
 +
 +static const struct export_operations bch_export_ops = {
 +      .encode_fh      = bch2_encode_fh,
 +      .fh_to_dentry   = bch2_fh_to_dentry,
 +      .fh_to_parent   = bch2_fh_to_parent,
 +      .get_parent     = bch2_get_parent,
 +      .get_name       = bch2_get_name,
 +};
 +
 +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
 +                              struct bch_inode_info *inode,
 +                              struct bch_inode_unpacked *bi,
 +                              struct bch_subvolume *subvol)
 +{
 +      bch2_inode_update_after_write(trans, inode, bi, ~0);
 +
 +      if (BCH_SUBVOLUME_SNAP(subvol))
 +              set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
 +      else
 +              clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
 +
 +      inode->v.i_blocks       = bi->bi_sectors;
 +      inode->v.i_ino          = bi->bi_inum;
 +      inode->v.i_rdev         = bi->bi_dev;
 +      inode->v.i_generation   = bi->bi_generation;
 +      inode->v.i_size         = bi->bi_size;
 +
 +      inode->ei_flags         = 0;
 +      inode->ei_quota_reserved = 0;
 +      inode->ei_qid           = bch_qid(bi);
 +      inode->ei_subvol        = inum.subvol;
 +
 +      inode->v.i_mapping->a_ops = &bch_address_space_operations;
 +
 +      switch (inode->v.i_mode & S_IFMT) {
 +      case S_IFREG:
 +              inode->v.i_op   = &bch_file_inode_operations;
 +              inode->v.i_fop  = &bch_file_operations;
 +              break;
 +      case S_IFDIR:
 +              inode->v.i_op   = &bch_dir_inode_operations;
 +              inode->v.i_fop  = &bch_dir_file_operations;
 +              break;
 +      case S_IFLNK:
 +              inode_nohighmem(&inode->v);
 +              inode->v.i_op   = &bch_symlink_inode_operations;
 +              break;
 +      default:
 +              init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
 +              inode->v.i_op   = &bch_special_inode_operations;
 +              break;
 +      }
 +
 +      mapping_set_large_folios(inode->v.i_mapping);
 +}
 +
 +static struct inode *bch2_alloc_inode(struct super_block *sb)
 +{
 +      struct bch_inode_info *inode;
 +
 +      inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
 +      if (!inode)
 +              return NULL;
 +
 +      inode_init_once(&inode->v);
 +      mutex_init(&inode->ei_update_lock);
 +      two_state_lock_init(&inode->ei_pagecache_lock);
 +      INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
 +      mutex_init(&inode->ei_quota_lock);
 +
 +      return &inode->v;
 +}
 +
 +static void bch2_i_callback(struct rcu_head *head)
 +{
 +      struct inode *vinode = container_of(head, struct inode, i_rcu);
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +
 +      kmem_cache_free(bch2_inode_cache, inode);
 +}
 +
 +static void bch2_destroy_inode(struct inode *vinode)
 +{
 +      call_rcu(&vinode->i_rcu, bch2_i_callback);
 +}
 +
 +static int inode_update_times_fn(struct btree_trans *trans,
 +                               struct bch_inode_info *inode,
 +                               struct bch_inode_unpacked *bi,
 +                               void *p)
 +{
 +      struct bch_fs *c = inode->v.i_sb->s_fs_info;
 +
 +      bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
 +      bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
 +      bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
 +
 +      return 0;
 +}
 +
 +static int bch2_vfs_write_inode(struct inode *vinode,
 +                              struct writeback_control *wbc)
 +{
 +      struct bch_fs *c = vinode->i_sb->s_fs_info;
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +      int ret;
 +
 +      mutex_lock(&inode->ei_update_lock);
 +      ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
 +                             ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
 +      mutex_unlock(&inode->ei_update_lock);
 +
 +      return bch2_err_class(ret);
 +}
 +
 +static void bch2_evict_inode(struct inode *vinode)
 +{
 +      struct bch_fs *c = vinode->i_sb->s_fs_info;
 +      struct bch_inode_info *inode = to_bch_ei(vinode);
 +
 +      truncate_inode_pages_final(&inode->v.i_data);
 +
 +      clear_inode(&inode->v);
 +
 +      BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
 +
 +      if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
 +              bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
 +                              KEY_TYPE_QUOTA_WARN);
 +              bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
 +                              KEY_TYPE_QUOTA_WARN);
 +              bch2_inode_rm(c, inode_inum(inode));
 +      }
 +
 +      mutex_lock(&c->vfs_inodes_lock);
 +      list_del_init(&inode->ei_vfs_inode_list);
 +      mutex_unlock(&c->vfs_inodes_lock);
 +}
 +
 +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
 +{
 +      struct bch_inode_info *inode, **i;
 +      DARRAY(struct bch_inode_info *) grabbed;
 +      bool clean_pass = false, this_pass_clean;
 +
 +      /*
 +       * Initially, we scan for inodes without I_DONTCACHE, then mark them to
 +       * be pruned with d_mark_dontcache().
 +       *
 +       * Once we've had a clean pass where we didn't find any inodes without
 +       * I_DONTCACHE, we wait for them to be freed:
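 +       *
 +       * Concretely: this_pass_clean tracks whether the current scan found
 +       * anything to grab, clean_pass latches the previous scan's result, and
 +       * the function only returns once two consecutive passes found nothing
 +       * left to prune.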
 +       */
 +
 +      darray_init(&grabbed);
 +      darray_make_room(&grabbed, 1024);
 +again:
 +      cond_resched();
 +      this_pass_clean = true;
 +
 +      mutex_lock(&c->vfs_inodes_lock);
 +      list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
 +              if (!snapshot_list_has_id(s, inode->ei_subvol))
 +                      continue;
 +
 +              if (!(inode->v.i_state & I_DONTCACHE) &&
 +                  !(inode->v.i_state & I_FREEING) &&
 +                  igrab(&inode->v)) {
 +                      this_pass_clean = false;
 +
 +                      if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
 +                              iput(&inode->v);
 +                              break;
 +                      }
 +              } else if (clean_pass && this_pass_clean) {
 +                      wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
 +                      DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
 +
 +                      prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 +                      mutex_unlock(&c->vfs_inodes_lock);
 +
 +                      schedule();
 +                      finish_wait(wq, &wait.wq_entry);
 +                      goto again;
 +              }
 +      }
 +      mutex_unlock(&c->vfs_inodes_lock);
 +
 +      darray_for_each(grabbed, i) {
 +              inode = *i;
 +              d_mark_dontcache(&inode->v);
 +              d_prune_aliases(&inode->v);
 +              iput(&inode->v);
 +      }
 +      grabbed.nr = 0;
 +
 +      if (!clean_pass || !this_pass_clean) {
 +              clean_pass = this_pass_clean;
 +              goto again;
 +      }
 +
 +      darray_exit(&grabbed);
 +}
 +
 +static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 +{
 +      struct super_block *sb = dentry->d_sb;
 +      struct bch_fs *c = sb->s_fs_info;
 +      struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
 +      unsigned shift = sb->s_blocksize_bits - 9;
 +      /*
 +       * this assumes inodes take up 64 bytes, which is a decent average
 +       * number:
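 +       *
 +       * (usage is reported in 512-byte sectors, so each free sector is
 +       * assumed to hold 512 / 64 = 8 inodes; hence the << 3.)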
 +       */
 +      u64 avail_inodes = ((usage.capacity - usage.used) << 3);
 +      u64 fsid;
 +
 +      buf->f_type     = BCACHEFS_STATFS_MAGIC;
 +      buf->f_bsize    = sb->s_blocksize;
 +      buf->f_blocks   = usage.capacity >> shift;
 +      buf->f_bfree    = usage.free >> shift;
 +      buf->f_bavail   = avail_factor(usage.free) >> shift;
 +
 +      buf->f_files    = usage.nr_inodes + avail_inodes;
 +      buf->f_ffree    = avail_inodes;
 +
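 +      /*
 +       * Fold the 128-bit user UUID down to 64 bits, then split it into the
 +       * two 32-bit f_fsid words:
 +       */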
 +      fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
 +             le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
 +      buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 +      buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 +      buf->f_namelen  = BCH_NAME_MAX;
 +
 +      return 0;
 +}
 +
 +static int bch2_sync_fs(struct super_block *sb, int wait)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      int ret;
 +
 +      if (c->opts.journal_flush_disabled)
 +              return 0;
 +
 +      if (!wait) {
 +              bch2_journal_flush_async(&c->journal, NULL);
 +              return 0;
 +      }
 +
 +      ret = bch2_journal_flush(&c->journal);
 +      return bch2_err_class(ret);
 +}
 +
 +static struct bch_fs *bch2_path_to_fs(const char *path)
 +{
 +      struct bch_fs *c;
 +      dev_t dev;
 +      int ret;
 +
 +      ret = lookup_bdev(path, &dev);
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      c = bch2_dev_to_fs(dev);
 +      if (c)
 +              closure_put(&c->cl);
 +      return c ?: ERR_PTR(-ENOENT);
 +}
 +
 +static char **split_devs(const char *_dev_name, unsigned *nr)
 +{
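 +      /*
 +       * Split a colon separated device list: e.g. "/dev/sda1:/dev/sdb1"
 +       * (illustrative paths) becomes { "/dev/sda1", "/dev/sdb1", NULL }.
 +       * The strings point into a single kstrdup()'d buffer, which is why
 +       * the caller frees devs[0] and then the array itself.
 +       */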
 +      char *dev_name = NULL, **devs = NULL, *s;
 +      size_t i = 0, nr_devs = 0;
 +
 +      dev_name = kstrdup(_dev_name, GFP_KERNEL);
 +      if (!dev_name)
 +              return NULL;
 +
 +      for (s = dev_name; s; s = strchr(s + 1, ':'))
 +              nr_devs++;
 +
 +      devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
 +      if (!devs) {
 +              kfree(dev_name);
 +              return NULL;
 +      }
 +
 +      while ((s = strsep(&dev_name, ":")))
 +              devs[i++] = s;
 +
 +      *nr = nr_devs;
 +      return devs;
 +}
 +
 +static int bch2_remount(struct super_block *sb, int *flags, char *data)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      struct bch_opts opts = bch2_opts_empty();
 +      int ret;
 +
 +      opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
 +
 +      ret = bch2_parse_mount_opts(c, &opts, data);
 +      if (ret)
 +              goto err;
 +
 +      if (opts.read_only != c->opts.read_only) {
 +              down_write(&c->state_lock);
 +
 +              if (opts.read_only) {
 +                      bch2_fs_read_only(c);
 +
 +                      sb->s_flags |= SB_RDONLY;
 +              } else {
 +                      ret = bch2_fs_read_write(c);
 +                      if (ret) {
 +                              bch_err(c, "error going rw: %i", ret);
 +                              up_write(&c->state_lock);
 +                              ret = -EINVAL;
 +                              goto err;
 +                      }
 +
 +                      sb->s_flags &= ~SB_RDONLY;
 +              }
 +
 +              c->opts.read_only = opts.read_only;
 +
 +              up_write(&c->state_lock);
 +      }
 +
 +      if (opt_defined(opts, errors))
 +              c->opts.errors = opts.errors;
 +err:
 +      return bch2_err_class(ret);
 +}
 +
 +static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 +{
 +      struct bch_fs *c = root->d_sb->s_fs_info;
 +      struct bch_dev *ca;
 +      unsigned i;
 +      bool first = true;
 +
 +      for_each_online_member(ca, c, i) {
 +              if (!first)
 +                      seq_putc(seq, ':');
 +              first = false;
 +              seq_puts(seq, "/dev/");
 +              seq_puts(seq, ca->name);
 +      }
 +
 +      return 0;
 +}
 +
 +static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 +{
 +      struct bch_fs *c = root->d_sb->s_fs_info;
 +      enum bch_opt_id i;
 +      struct printbuf buf = PRINTBUF;
 +      int ret = 0;
 +
 +      for (i = 0; i < bch2_opts_nr; i++) {
 +              const struct bch_option *opt = &bch2_opt_table[i];
 +              u64 v = bch2_opt_get_by_id(&c->opts, i);
 +
 +              if (!(opt->flags & OPT_MOUNT))
 +                      continue;
 +
 +              if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
 +                      continue;
 +
 +              printbuf_reset(&buf);
 +              bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
 +                               OPT_SHOW_MOUNT_STYLE);
 +              seq_putc(seq, ',');
 +              seq_puts(seq, buf.buf);
 +      }
 +
 +      if (buf.allocation_failure)
 +              ret = -ENOMEM;
 +      printbuf_exit(&buf);
 +      return ret;
 +}
 +
 +static void bch2_put_super(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +
 +      __bch2_fs_stop(c);
 +}
 +
 +/*
 + * bcachefs doesn't currently integrate intwrite freeze protection but the
 + * internal write references serve the same purpose. Therefore reuse the
 + * read-only transition code to perform the quiesce. The caveat is that we don't
 + * currently have the ability to block tasks that want a write reference while
 + * the superblock is frozen. This is fine for now, but we should either add
 + * blocking support or find a way to integrate sb_start_intwrite() and friends.
 + */
 +static int bch2_freeze(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +
 +      down_write(&c->state_lock);
 +      bch2_fs_read_only(c);
 +      up_write(&c->state_lock);
 +      return 0;
 +}
 +
 +static int bch2_unfreeze(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +      int ret;
 +
 +      down_write(&c->state_lock);
 +      ret = bch2_fs_read_write(c);
 +      up_write(&c->state_lock);
 +      return ret;
 +}
 +
 +static const struct super_operations bch_super_operations = {
 +      .alloc_inode    = bch2_alloc_inode,
 +      .destroy_inode  = bch2_destroy_inode,
 +      .write_inode    = bch2_vfs_write_inode,
 +      .evict_inode    = bch2_evict_inode,
 +      .sync_fs        = bch2_sync_fs,
 +      .statfs         = bch2_statfs,
 +      .show_devname   = bch2_show_devname,
 +      .show_options   = bch2_show_options,
 +      .remount_fs     = bch2_remount,
 +      .put_super      = bch2_put_super,
 +      .freeze_fs      = bch2_freeze,
 +      .unfreeze_fs    = bch2_unfreeze,
 +};
 +
 +static int bch2_set_super(struct super_block *s, void *data)
 +{
 +      s->s_fs_info = data;
 +      return 0;
 +}
 +
 +static int bch2_noset_super(struct super_block *s, void *data)
 +{
 +      return -EBUSY;
 +}
 +
 +static int bch2_test_super(struct super_block *s, void *data)
 +{
 +      struct bch_fs *c = s->s_fs_info;
 +      struct bch_fs **devs = data;
 +      unsigned i;
 +
 +      if (!c)
 +              return false;
 +
 +      for (i = 0; devs[i]; i++)
 +              if (c != devs[i])
 +                      return false;
 +      return true;
 +}
 +
 +static struct dentry *bch2_mount(struct file_system_type *fs_type,
 +                               int flags, const char *dev_name, void *data)
 +{
 +      struct bch_fs *c;
 +      struct bch_dev *ca;
 +      struct super_block *sb;
 +      struct inode *vinode;
 +      struct bch_opts opts = bch2_opts_empty();
 +      char **devs;
 +      struct bch_fs **devs_to_fs = NULL;
 +      unsigned i, nr_devs;
 +      int ret;
 +
 +      opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
 +
 +      ret = bch2_parse_mount_opts(NULL, &opts, data);
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      if (!dev_name || strlen(dev_name) == 0)
 +              return ERR_PTR(-EINVAL);
 +
 +      devs = split_devs(dev_name, &nr_devs);
 +      if (!devs)
 +              return ERR_PTR(-ENOMEM);
 +
 +      devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
 +      if (!devs_to_fs) {
 +              sb = ERR_PTR(-ENOMEM);
 +              goto got_sb;
 +      }
 +
 +      for (i = 0; i < nr_devs; i++)
 +              devs_to_fs[i] = bch2_path_to_fs(devs[i]);
 +
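 +      /*
 +       * Look for an existing superblock for this set of devices: each path
 +       * is mapped to an already-open bch_fs (if any), and bch2_test_super()
 +       * only matches a superblock whose bch_fs is what every listed device
 +       * resolved to.
 +       */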
 +      sb = sget(fs_type, bch2_test_super, bch2_noset_super,
 +                flags|SB_NOSEC, devs_to_fs);
 +      if (!IS_ERR(sb))
 +              goto got_sb;
 +
 +      c = bch2_fs_open(devs, nr_devs, opts);
 +      if (IS_ERR(c)) {
 +              sb = ERR_CAST(c);
 +              goto got_sb;
 +      }
 +
 +      /* Some options can't be parsed until after the fs is started: */
 +      ret = bch2_parse_mount_opts(c, &opts, data);
 +      if (ret) {
 +              bch2_fs_stop(c);
 +              sb = ERR_PTR(ret);
 +              goto got_sb;
 +      }
 +
 +      bch2_opts_apply(&c->opts, opts);
 +
 +      sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
 +      if (IS_ERR(sb))
 +              bch2_fs_stop(c);
 +got_sb:
 +      kfree(devs_to_fs);
 +      kfree(devs[0]);
 +      kfree(devs);
 +
 +      if (IS_ERR(sb)) {
 +              ret = PTR_ERR(sb);
 +              ret = bch2_err_class(ret);
 +              return ERR_PTR(ret);
 +      }
 +
 +      c = sb->s_fs_info;
 +
 +      if (sb->s_root) {
 +              if ((flags ^ sb->s_flags) & SB_RDONLY) {
 +                      ret = -EBUSY;
 +                      goto err_put_super;
 +              }
 +              goto out;
 +      }
 +
 +      sb->s_blocksize         = block_bytes(c);
 +      sb->s_blocksize_bits    = ilog2(block_bytes(c));
 +      sb->s_maxbytes          = MAX_LFS_FILESIZE;
 +      sb->s_op                = &bch_super_operations;
 +      sb->s_export_op         = &bch_export_ops;
 +#ifdef CONFIG_BCACHEFS_QUOTA
 +      sb->s_qcop              = &bch2_quotactl_operations;
 +      sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
 +#endif
 +      sb->s_xattr             = bch2_xattr_handlers;
 +      sb->s_magic             = BCACHEFS_STATFS_MAGIC;
 +      sb->s_time_gran         = c->sb.nsec_per_time_unit;
 +      sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
 +      sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
 +      c->vfs_sb               = sb;
 +      strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 +
 +      ret = super_setup_bdi(sb);
 +      if (ret)
 +              goto err_put_super;
 +
 +      sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
 +
 +      for_each_online_member(ca, c, i) {
 +              struct block_device *bdev = ca->disk_sb.bdev;
 +
 +              /* XXX: create an anonymous device for multi device filesystems */
 +              sb->s_bdev      = bdev;
 +              sb->s_dev       = bdev->bd_dev;
 +              percpu_ref_put(&ca->io_ref);
 +              break;
 +      }
 +
 +      c->dev = sb->s_dev;
 +
 +#ifdef CONFIG_BCACHEFS_POSIX_ACL
 +      if (c->opts.acl)
 +              sb->s_flags     |= SB_POSIXACL;
 +#endif
 +
++      sb->s_shrink->seeks = 0;
 +
 +      vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
 +      ret = PTR_ERR_OR_ZERO(vinode);
 +      if (ret) {
 +              bch_err_msg(c, ret, "mounting: error getting root inode");
 +              goto err_put_super;
 +      }
 +
 +      sb->s_root = d_make_root(vinode);
 +      if (!sb->s_root) {
 +              bch_err(c, "error mounting: error allocating root dentry");
 +              ret = -ENOMEM;
 +              goto err_put_super;
 +      }
 +
 +      sb->s_flags |= SB_ACTIVE;
 +out:
 +      return dget(sb->s_root);
 +
 +err_put_super:
 +      sb->s_fs_info = NULL;
 +      c->vfs_sb = NULL;
 +      deactivate_locked_super(sb);
 +      bch2_fs_stop(c);
 +      return ERR_PTR(bch2_err_class(ret));
 +}
 +
 +static void bch2_kill_sb(struct super_block *sb)
 +{
 +      struct bch_fs *c = sb->s_fs_info;
 +
 +      if (c)
 +              c->vfs_sb = NULL;
 +      generic_shutdown_super(sb);
 +      if (c)
 +              bch2_fs_free(c);
 +}
 +
 +static struct file_system_type bcache_fs_type = {
 +      .owner          = THIS_MODULE,
 +      .name           = "bcachefs",
 +      .mount          = bch2_mount,
 +      .kill_sb        = bch2_kill_sb,
 +      .fs_flags       = FS_REQUIRES_DEV,
 +};
 +
 +MODULE_ALIAS_FS("bcachefs");
 +
 +void bch2_vfs_exit(void)
 +{
 +      unregister_filesystem(&bcache_fs_type);
 +      kmem_cache_destroy(bch2_inode_cache);
 +}
 +
 +int __init bch2_vfs_init(void)
 +{
 +      int ret = -ENOMEM;
 +
 +      bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
 +      if (!bch2_inode_cache)
 +              goto err;
 +
 +      ret = register_filesystem(&bcache_fs_type);
 +      if (ret)
 +              goto err;
 +
 +      return 0;
 +err:
 +      bch2_vfs_exit();
 +      return ret;
 +}
 +
 +#endif /* NO_BCACHEFS_FS */
index eb764b9a4629696e9444103c272ad01ed07643c0,0000000000000000000000000000000000000000..397116966a7cd40ef629b98cf16670476ff583a6
mode 100644,000000..100644
--- /dev/null
@@@ -1,1031 -1,0 +1,1031 @@@
-               c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * bcache sysfs interfaces
 + *
 + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 + * Copyright 2012 Google, Inc.
 + */
 +
 +#ifndef NO_BCACHEFS_SYSFS
 +
 +#include "bcachefs.h"
 +#include "alloc_background.h"
 +#include "alloc_foreground.h"
 +#include "sysfs.h"
 +#include "btree_cache.h"
 +#include "btree_io.h"
 +#include "btree_iter.h"
 +#include "btree_key_cache.h"
 +#include "btree_update.h"
 +#include "btree_update_interior.h"
 +#include "btree_gc.h"
 +#include "buckets.h"
 +#include "clock.h"
 +#include "disk_groups.h"
 +#include "ec.h"
 +#include "inode.h"
 +#include "journal.h"
 +#include "keylist.h"
 +#include "move.h"
 +#include "movinggc.h"
 +#include "nocow_locking.h"
 +#include "opts.h"
 +#include "rebalance.h"
 +#include "replicas.h"
 +#include "super-io.h"
 +#include "tests.h"
 +
 +#include <linux/blkdev.h>
 +#include <linux/sort.h>
 +#include <linux/sched/clock.h>
 +
 +#include "util.h"
 +
 +#define SYSFS_OPS(type)                                                       \
 +const struct sysfs_ops type ## _sysfs_ops = {                         \
 +      .show   = type ## _show,                                        \
 +      .store  = type ## _store                                        \
 +}
 +
 +#define SHOW(fn)                                                      \
 +static ssize_t fn ## _to_text(struct printbuf *,                      \
 +                            struct kobject *, struct attribute *);    \
 +                                                                      \
 +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
 +                         char *buf)                                   \
 +{                                                                     \
 +      struct printbuf out = PRINTBUF;                                 \
 +      ssize_t ret = fn ## _to_text(&out, kobj, attr);                 \
 +                                                                      \
 +      if (out.pos && out.buf[out.pos - 1] != '\n')                    \
 +              prt_newline(&out);                                      \
 +                                                                      \
 +      if (!ret && out.allocation_failure)                             \
 +              ret = -ENOMEM;                                          \
 +                                                                      \
 +      if (!ret) {                                                     \
 +              ret = min_t(size_t, out.pos, PAGE_SIZE - 1);            \
 +              memcpy(buf, out.buf, ret);                              \
 +      }                                                               \
 +      printbuf_exit(&out);                                            \
 +      return bch2_err_class(ret);                                     \
 +}                                                                     \
 +                                                                      \
 +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
 +                            struct attribute *attr)
 +
 +#define STORE(fn)                                                     \
 +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
 +                          const char *, size_t);                      \
 +                                                                      \
 +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
 +                          const char *buf, size_t size)               \
 +{                                                                     \
 +      return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
 +}                                                                     \
 +                                                                      \
 +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
 +                                const char *buf, size_t size)
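 +
 +/*
 + * SHOW(foo) and STORE(foo) are each followed directly by a function body:
 + * the generated foo_show() wrapper allocates the printbuf and copies it to
 + * the sysfs buffer, both wrappers map errors with bch2_err_class(), and
 + * SYSFS_OPS(foo) packages the pair into a struct sysfs_ops.
 + */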
 +
 +#define __sysfs_attribute(_name, _mode)                                       \
 +      static struct attribute sysfs_##_name =                         \
 +              { .name = #_name, .mode = _mode }
 +
 +#define write_attribute(n)    __sysfs_attribute(n, 0200)
 +#define read_attribute(n)     __sysfs_attribute(n, 0444)
 +#define rw_attribute(n)               __sysfs_attribute(n, 0644)
 +
 +#define sysfs_printf(file, fmt, ...)                                  \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              prt_printf(out, fmt "\n", __VA_ARGS__);                 \
 +} while (0)
 +
 +#define sysfs_print(file, var)                                                \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              snprint(out, var);                                      \
 +} while (0)
 +
 +#define sysfs_hprint(file, val)                                               \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              prt_human_readable_s64(out, val);                       \
 +} while (0)
 +
 +#define sysfs_strtoul(file, var)                                      \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              return strtoul_safe(buf, var) ?: (ssize_t) size;        \
 +} while (0)
 +
 +#define sysfs_strtoul_clamp(file, var, min, max)                      \
 +do {                                                                  \
 +      if (attr == &sysfs_ ## file)                                    \
 +              return strtoul_safe_clamp(buf, var, min, max)           \
 +                      ?: (ssize_t) size;                              \
 +} while (0)
 +
 +#define strtoul_or_return(cp)                                         \
 +({                                                                    \
 +      unsigned long _v;                                               \
 +      int _r = kstrtoul(cp, 10, &_v);                                 \
 +      if (_r)                                                         \
 +              return _r;                                              \
 +      _v;                                                             \
 +})
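 +
 +/*
 + * Note that strtoul_or_return() is a statement expression: on a parse error
 + * it returns from the enclosing store function rather than evaluating to an
 + * error code.
 + */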
 +
 +write_attribute(trigger_gc);
 +write_attribute(trigger_discards);
 +write_attribute(trigger_invalidates);
 +write_attribute(prune_cache);
 +write_attribute(btree_wakeup);
 +rw_attribute(btree_gc_periodic);
 +rw_attribute(gc_gens_pos);
 +
 +read_attribute(uuid);
 +read_attribute(minor);
 +read_attribute(bucket_size);
 +read_attribute(first_bucket);
 +read_attribute(nbuckets);
 +rw_attribute(durability);
 +read_attribute(iodone);
 +
 +read_attribute(io_latency_read);
 +read_attribute(io_latency_write);
 +read_attribute(io_latency_stats_read);
 +read_attribute(io_latency_stats_write);
 +read_attribute(congested);
 +
 +read_attribute(btree_write_stats);
 +
 +read_attribute(btree_cache_size);
 +read_attribute(compression_stats);
 +read_attribute(journal_debug);
 +read_attribute(btree_updates);
 +read_attribute(btree_cache);
 +read_attribute(btree_key_cache);
 +read_attribute(stripes_heap);
 +read_attribute(open_buckets);
 +read_attribute(open_buckets_partial);
 +read_attribute(write_points);
 +read_attribute(nocow_lock_table);
 +
 +#ifdef BCH_WRITE_REF_DEBUG
 +read_attribute(write_refs);
 +
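 +/*
 + * One human-readable name per entry of c->writes[], generated from the
 + * BCH_WRITE_REFS() x-macro:
 + */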
 +static const char * const bch2_write_refs[] = {
 +#define x(n)  #n,
 +      BCH_WRITE_REFS()
 +#undef x
 +      NULL
 +};
 +
 +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
 +{
 +      bch2_printbuf_tabstop_push(out, 24);
 +
 +      for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
 +              prt_str(out, bch2_write_refs[i]);
 +              prt_tab(out);
 +              prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
 +              prt_newline(out);
 +      }
 +}
 +#endif
 +
 +read_attribute(internal_uuid);
 +read_attribute(disk_groups);
 +
 +read_attribute(has_data);
 +read_attribute(alloc_debug);
 +
 +#define x(t, n, ...) read_attribute(t);
 +BCH_PERSISTENT_COUNTERS()
 +#undef x
 +
 +rw_attribute(discard);
 +rw_attribute(label);
 +
 +rw_attribute(copy_gc_enabled);
 +read_attribute(copy_gc_wait);
 +
 +rw_attribute(rebalance_enabled);
 +sysfs_pd_controller_attribute(rebalance);
 +read_attribute(rebalance_work);
 +rw_attribute(promote_whole_extents);
 +
 +read_attribute(new_stripes);
 +
 +read_attribute(io_timers_read);
 +read_attribute(io_timers_write);
 +
 +read_attribute(moving_ctxts);
 +
 +#ifdef CONFIG_BCACHEFS_TESTS
 +write_attribute(perf_test);
 +#endif /* CONFIG_BCACHEFS_TESTS */
 +
 +#define x(_name)                                              \
 +      static struct attribute sysfs_time_stat_##_name =               \
 +              { .name = #_name, .mode = 0444 };
 +      BCH_TIME_STATS()
 +#undef x
 +
 +static struct attribute sysfs_state_rw = {
 +      .name = "state",
 +      .mode =  0444,
 +};
 +
 +static size_t bch2_btree_cache_size(struct bch_fs *c)
 +{
 +      size_t ret = 0;
 +      struct btree *b;
 +
 +      mutex_lock(&c->btree_cache.lock);
 +      list_for_each_entry(b, &c->btree_cache.live, list)
 +              ret += btree_bytes(c);
 +
 +      mutex_unlock(&c->btree_cache.lock);
 +      return ret;
 +}
 +
 +static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 +{
 +      struct btree_trans *trans;
 +      struct btree_iter iter;
 +      struct bkey_s_c k;
 +      enum btree_id id;
 +      u64 nr_uncompressed_extents = 0,
 +          nr_compressed_extents = 0,
 +          nr_incompressible_extents = 0,
 +          uncompressed_sectors = 0,
 +          incompressible_sectors = 0,
 +          compressed_sectors_compressed = 0,
 +          compressed_sectors_uncompressed = 0;
 +      int ret = 0;
 +
 +      if (!test_bit(BCH_FS_STARTED, &c->flags))
 +              return -EPERM;
 +
 +      trans = bch2_trans_get(c);
 +
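 +      /*
 +       * Walk every extent-bearing btree: pointers with no compression (or
 +       * marked incompressible) are counted by key size, while compressed
 +       * pointers are counted by both their on-disk (compressed_size) and
 +       * logical (uncompressed_size) sectors.
 +       */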
 +      for (id = 0; id < BTREE_ID_NR; id++) {
 +              if (!btree_type_has_ptrs(id))
 +                      continue;
 +
 +              for_each_btree_key(trans, iter, id, POS_MIN,
 +                                 BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
 +                      struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 +                      const union bch_extent_entry *entry;
 +                      struct extent_ptr_decoded p;
 +                      bool compressed = false, uncompressed = false, incompressible = false;
 +
 +                      bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 +                              switch (p.crc.compression_type) {
 +                              case BCH_COMPRESSION_TYPE_none:
 +                                      uncompressed = true;
 +                                      uncompressed_sectors += k.k->size;
 +                                      break;
 +                              case BCH_COMPRESSION_TYPE_incompressible:
 +                                      incompressible = true;
 +                                      incompressible_sectors += k.k->size;
 +                                      break;
 +                              default:
 +                                      compressed_sectors_compressed +=
 +                                              p.crc.compressed_size;
 +                                      compressed_sectors_uncompressed +=
 +                                              p.crc.uncompressed_size;
 +                                      compressed = true;
 +                                      break;
 +                              }
 +                      }
 +
 +                      if (incompressible)
 +                              nr_incompressible_extents++;
 +                      else if (uncompressed)
 +                              nr_uncompressed_extents++;
 +                      else if (compressed)
 +                              nr_compressed_extents++;
 +              }
 +              bch2_trans_iter_exit(trans, &iter);
 +      }
 +
 +      bch2_trans_put(trans);
 +
 +      if (ret)
 +              return ret;
 +
 +      prt_printf(out, "uncompressed:\n");
 +      prt_printf(out, "       nr extents:             %llu\n", nr_uncompressed_extents);
 +      prt_printf(out, "       size:                   ");
 +      prt_human_readable_u64(out, uncompressed_sectors << 9);
 +      prt_printf(out, "\n");
 +
 +      prt_printf(out, "compressed:\n");
 +      prt_printf(out, "       nr extents:             %llu\n", nr_compressed_extents);
 +      prt_printf(out, "       compressed size:        ");
 +      prt_human_readable_u64(out, compressed_sectors_compressed << 9);
 +      prt_printf(out, "\n");
 +      prt_printf(out, "       uncompressed size:      ");
 +      prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
 +      prt_printf(out, "\n");
 +
 +      prt_printf(out, "incompressible:\n");
 +      prt_printf(out, "       nr extents:             %llu\n", nr_incompressible_extents);
 +      prt_printf(out, "       size:                   ");
 +      prt_human_readable_u64(out, incompressible_sectors << 9);
 +      prt_printf(out, "\n");
 +      return 0;
 +}
 +
 +static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
 +{
 +      prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
 +      bch2_bpos_to_text(out, c->gc_gens_pos);
 +      prt_printf(out, "\n");
 +}
 +
 +static void bch2_btree_wakeup_all(struct bch_fs *c)
 +{
 +      struct btree_trans *trans;
 +
 +      seqmutex_lock(&c->btree_trans_lock);
 +      list_for_each_entry(trans, &c->btree_trans_list, list) {
 +              struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
 +
 +              if (b)
 +                      six_lock_wakeup_all(&b->lock);
 +
 +      }
 +      seqmutex_unlock(&c->btree_trans_lock);
 +}
 +
 +SHOW(bch2_fs)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 +
 +      sysfs_print(minor,                      c->minor);
 +      sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
 +
 +      sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
 +
 +      if (attr == &sysfs_btree_write_stats)
 +              bch2_btree_write_stats_to_text(out, c);
 +
 +      sysfs_printf(btree_gc_periodic, "%u",   (int) c->btree_gc_periodic);
 +
 +      if (attr == &sysfs_gc_gens_pos)
 +              bch2_gc_gens_pos_to_text(out, c);
 +
 +      sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
 +
 +      sysfs_printf(rebalance_enabled,         "%i", c->rebalance.enabled);
 +      sysfs_pd_controller_show(rebalance,     &c->rebalance.pd); /* XXX */
 +
 +      if (attr == &sysfs_copy_gc_wait)
 +              bch2_copygc_wait_to_text(out, c);
 +
 +      if (attr == &sysfs_rebalance_work)
 +              bch2_rebalance_work_to_text(out, c);
 +
 +      sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 +
 +      /* Debugging: */
 +
 +      if (attr == &sysfs_journal_debug)
 +              bch2_journal_debug_to_text(out, &c->journal);
 +
 +      if (attr == &sysfs_btree_updates)
 +              bch2_btree_updates_to_text(out, c);
 +
 +      if (attr == &sysfs_btree_cache)
 +              bch2_btree_cache_to_text(out, c);
 +
 +      if (attr == &sysfs_btree_key_cache)
 +              bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
 +
 +      if (attr == &sysfs_stripes_heap)
 +              bch2_stripes_heap_to_text(out, c);
 +
 +      if (attr == &sysfs_open_buckets)
 +              bch2_open_buckets_to_text(out, c);
 +
 +      if (attr == &sysfs_open_buckets_partial)
 +              bch2_open_buckets_partial_to_text(out, c);
 +
 +      if (attr == &sysfs_write_points)
 +              bch2_write_points_to_text(out, c);
 +
 +      if (attr == &sysfs_compression_stats)
 +              bch2_compression_stats_to_text(out, c);
 +
 +      if (attr == &sysfs_new_stripes)
 +              bch2_new_stripes_to_text(out, c);
 +
 +      if (attr == &sysfs_io_timers_read)
 +              bch2_io_timers_to_text(out, &c->io_clock[READ]);
 +
 +      if (attr == &sysfs_io_timers_write)
 +              bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
 +
 +      if (attr == &sysfs_moving_ctxts)
 +              bch2_fs_moving_ctxts_to_text(out, c);
 +
 +#ifdef BCH_WRITE_REF_DEBUG
 +      if (attr == &sysfs_write_refs)
 +              bch2_write_refs_to_text(out, c);
 +#endif
 +
 +      if (attr == &sysfs_nocow_lock_table)
 +              bch2_nocow_locks_to_text(out, &c->nocow_locks);
 +
 +      if (attr == &sysfs_disk_groups)
 +              bch2_disk_groups_to_text(out, c);
 +
 +      return 0;
 +}
 +
 +STORE(bch2_fs)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 +
 +      if (attr == &sysfs_btree_gc_periodic) {
 +              ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
 +                      ?: (ssize_t) size;
 +
 +              wake_up_process(c->gc_thread);
 +              return ret;
 +      }
 +
 +      if (attr == &sysfs_copy_gc_enabled) {
 +              ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
 +                      ?: (ssize_t) size;
 +
 +              if (c->copygc_thread)
 +                      wake_up_process(c->copygc_thread);
 +              return ret;
 +      }
 +
 +      if (attr == &sysfs_rebalance_enabled) {
 +              ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
 +                      ?: (ssize_t) size;
 +
 +              rebalance_wakeup(c);
 +              return ret;
 +      }
 +
 +      sysfs_pd_controller_store(rebalance,    &c->rebalance.pd);
 +
 +      sysfs_strtoul(promote_whole_extents,    c->promote_whole_extents);
 +
 +      /* Debugging: */
 +
 +      if (!test_bit(BCH_FS_STARTED, &c->flags))
 +              return -EPERM;
 +
 +      /* Debugging: */
 +
 +      if (!test_bit(BCH_FS_RW, &c->flags))
 +              return -EROFS;
 +
 +      if (attr == &sysfs_prune_cache) {
 +              struct shrink_control sc;
 +
 +              sc.gfp_mask = GFP_KERNEL;
 +              sc.nr_to_scan = strtoul_or_return(buf);
++              c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
 +      }
 +
 +      if (attr == &sysfs_btree_wakeup)
 +              bch2_btree_wakeup_all(c);
 +
 +      if (attr == &sysfs_trigger_gc) {
 +              /*
 +               * Full gc is currently incompatible with btree key cache:
 +               */
 +#if 0
 +              down_read(&c->state_lock);
 +              bch2_gc(c, false, false);
 +              up_read(&c->state_lock);
 +#else
 +              bch2_gc_gens(c);
 +#endif
 +      }
 +
 +      if (attr == &sysfs_trigger_discards)
 +              bch2_do_discards(c);
 +
 +      if (attr == &sysfs_trigger_invalidates)
 +              bch2_do_invalidates(c);
 +
 +#ifdef CONFIG_BCACHEFS_TESTS
 +      if (attr == &sysfs_perf_test) {
 +              char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
 +              char *test              = strsep(&p, " \t\n");
 +              char *nr_str            = strsep(&p, " \t\n");
 +              char *threads_str       = strsep(&p, " \t\n");
 +              unsigned threads;
 +              u64 nr;
 +              int ret = -EINVAL;
 +
 +              if (threads_str &&
 +                  !(ret = kstrtouint(threads_str, 10, &threads)) &&
 +                  !(ret = bch2_strtoull_h(nr_str, &nr)))
 +                      ret = bch2_btree_perf_test(c, test, nr, threads);
 +              kfree(tmp);
 +
 +              if (ret)
 +                      size = ret;
 +      }
 +#endif
 +      return size;
 +}
 +SYSFS_OPS(bch2_fs);
 +
 +struct attribute *bch2_fs_files[] = {
 +      &sysfs_minor,
 +      &sysfs_btree_cache_size,
 +      &sysfs_btree_write_stats,
 +
 +      &sysfs_promote_whole_extents,
 +
 +      &sysfs_compression_stats,
 +
 +#ifdef CONFIG_BCACHEFS_TESTS
 +      &sysfs_perf_test,
 +#endif
 +      NULL
 +};
 +
 +/* counters dir */
 +
 +SHOW(bch2_fs_counters)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
 +      u64 counter = 0;
 +      u64 counter_since_mount = 0;
 +
 +      printbuf_tabstop_push(out, 32);
 +
 +      #define x(t, ...) \
 +              if (attr == &sysfs_##t) {                                       \
 +                      counter             = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
 +                      counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
 +                      prt_printf(out, "since mount:");                                \
 +                      prt_tab(out);                                           \
 +                      prt_human_readable_u64(out, counter_since_mount);       \
 +                      prt_newline(out);                                       \
 +                                                                              \
 +                      prt_printf(out, "since filesystem creation:");          \
 +                      prt_tab(out);                                           \
 +                      prt_human_readable_u64(out, counter);                   \
 +                      prt_newline(out);                                       \
 +              }
 +      BCH_PERSISTENT_COUNTERS()
 +      #undef x
 +      return 0;
 +}
 +
 +STORE(bch2_fs_counters) {
 +      return 0;
 +}
 +
 +SYSFS_OPS(bch2_fs_counters);
 +
 +struct attribute *bch2_fs_counters_files[] = {
 +#define x(t, ...) \
 +      &sysfs_##t,
 +      BCH_PERSISTENT_COUNTERS()
 +#undef x
 +      NULL
 +};
 +/* internal dir - just a wrapper */
 +
 +SHOW(bch2_fs_internal)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
 +
 +      return bch2_fs_to_text(out, &c->kobj, attr);
 +}
 +
 +STORE(bch2_fs_internal)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
 +
 +      return bch2_fs_store(&c->kobj, attr, buf, size);
 +}
 +SYSFS_OPS(bch2_fs_internal);
 +
 +struct attribute *bch2_fs_internal_files[] = {
 +      &sysfs_journal_debug,
 +      &sysfs_btree_updates,
 +      &sysfs_btree_cache,
 +      &sysfs_btree_key_cache,
 +      &sysfs_new_stripes,
 +      &sysfs_stripes_heap,
 +      &sysfs_open_buckets,
 +      &sysfs_open_buckets_partial,
 +      &sysfs_write_points,
 +#ifdef BCH_WRITE_REF_DEBUG
 +      &sysfs_write_refs,
 +#endif
 +      &sysfs_nocow_lock_table,
 +      &sysfs_io_timers_read,
 +      &sysfs_io_timers_write,
 +
 +      &sysfs_trigger_gc,
 +      &sysfs_trigger_discards,
 +      &sysfs_trigger_invalidates,
 +      &sysfs_prune_cache,
 +      &sysfs_btree_wakeup,
 +
 +      &sysfs_gc_gens_pos,
 +
 +      &sysfs_copy_gc_enabled,
 +      &sysfs_copy_gc_wait,
 +
 +      &sysfs_rebalance_enabled,
 +      &sysfs_rebalance_work,
 +      sysfs_pd_controller_files(rebalance),
 +
 +      &sysfs_moving_ctxts,
 +
 +      &sysfs_internal_uuid,
 +
 +      &sysfs_disk_groups,
 +      NULL
 +};
 +
 +/* options */
 +
 +SHOW(bch2_fs_opts_dir)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
 +      const struct bch_option *opt = container_of(attr, struct bch_option, attr);
 +      int id = opt - bch2_opt_table;
 +      u64 v = bch2_opt_get_by_id(&c->opts, id);
 +
 +      bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
 +      prt_char(out, '\n');
 +
 +      return 0;
 +}
 +
 +STORE(bch2_fs_opts_dir)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
 +      const struct bch_option *opt = container_of(attr, struct bch_option, attr);
 +      int ret, id = opt - bch2_opt_table;
 +      char *tmp;
 +      u64 v;
 +
 +      /*
 +       * We don't need to take c->writes for correctness, but it eliminates an
 +       * unsightly error message in the dmesg log when we're RO:
 +       */
 +      if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
 +              return -EROFS;
 +
 +      tmp = kstrdup(buf, GFP_KERNEL);
 +      if (!tmp) {
 +              ret = -ENOMEM;
 +              goto err;
 +      }
 +
 +      ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
 +      kfree(tmp);
 +
 +      if (ret < 0)
 +              goto err;
 +
 +      ret = bch2_opt_check_may_set(c, id, v);
 +      if (ret < 0)
 +              goto err;
 +
 +      bch2_opt_set_sb(c, opt, v);
 +      bch2_opt_set_by_id(&c->opts, id, v);
 +
 +      if ((id == Opt_background_target ||
 +           id == Opt_background_compression) && v) {
 +              bch2_rebalance_add_work(c, S64_MAX);
 +              rebalance_wakeup(c);
 +      }
 +
 +      ret = size;
 +err:
 +      bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
 +      return ret;
 +}
 +SYSFS_OPS(bch2_fs_opts_dir);
 +
 +struct attribute *bch2_fs_opts_dir_files[] = { NULL };
 +
 +int bch2_opts_create_sysfs_files(struct kobject *kobj)
 +{
 +      const struct bch_option *i;
 +      int ret;
 +
 +      for (i = bch2_opt_table;
 +           i < bch2_opt_table + bch2_opts_nr;
 +           i++) {
 +              if (!(i->flags & OPT_FS))
 +                      continue;
 +
 +              ret = sysfs_create_file(kobj, &i->attr);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      return 0;
 +}
 +
 +/* time stats */
 +
 +SHOW(bch2_fs_time_stats)
 +{
 +      struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
 +
 +#define x(name)                                                               \
 +      if (attr == &sysfs_time_stat_##name)                            \
 +              bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
 +      BCH_TIME_STATS()
 +#undef x
 +
 +      return 0;
 +}
 +
 +STORE(bch2_fs_time_stats)
 +{
 +      return size;
 +}
 +SYSFS_OPS(bch2_fs_time_stats);
 +
 +struct attribute *bch2_fs_time_stats_files[] = {
 +#define x(name)                                               \
 +      &sysfs_time_stat_##name,
 +      BCH_TIME_STATS()
 +#undef x
 +      NULL
 +};
 +
 +static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 +{
 +      struct bch_fs *c = ca->fs;
 +      struct bch_dev_usage stats = bch2_dev_usage_read(ca);
 +      unsigned i, nr[BCH_DATA_NR];
 +
 +      memset(nr, 0, sizeof(nr));
 +
 +      for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
 +              nr[c->open_buckets[i].data_type]++;
 +
 +      printbuf_tabstop_push(out, 8);
 +      printbuf_tabstop_push(out, 16);
 +      printbuf_tabstop_push(out, 16);
 +      printbuf_tabstop_push(out, 16);
 +      printbuf_tabstop_push(out, 16);
 +
 +      prt_tab(out);
 +      prt_str(out, "buckets");
 +      prt_tab_rjust(out);
 +      prt_str(out, "sectors");
 +      prt_tab_rjust(out);
 +      prt_str(out, "fragmented");
 +      prt_tab_rjust(out);
 +      prt_newline(out);
 +
 +      for (i = 0; i < BCH_DATA_NR; i++) {
 +              prt_str(out, bch2_data_types[i]);
 +              prt_tab(out);
 +              prt_u64(out, stats.d[i].buckets);
 +              prt_tab_rjust(out);
 +              prt_u64(out, stats.d[i].sectors);
 +              prt_tab_rjust(out);
 +              prt_u64(out, stats.d[i].fragmented);
 +              prt_tab_rjust(out);
 +              prt_newline(out);
 +      }
 +
 +      prt_str(out, "ec");
 +      prt_tab(out);
 +      prt_u64(out, stats.buckets_ec);
 +      prt_tab_rjust(out);
 +      prt_newline(out);
 +
 +      prt_newline(out);
 +
 +      prt_printf(out, "reserves:");
 +      prt_newline(out);
 +      for (i = 0; i < BCH_WATERMARK_NR; i++) {
 +              prt_str(out, bch2_watermarks[i]);
 +              prt_tab(out);
 +              prt_u64(out, bch2_dev_buckets_reserved(ca, i));
 +              prt_tab_rjust(out);
 +              prt_newline(out);
 +      }
 +
 +      prt_newline(out);
 +
 +      printbuf_tabstops_reset(out);
 +      printbuf_tabstop_push(out, 24);
 +
 +      prt_str(out, "freelist_wait");
 +      prt_tab(out);
 +      prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
 +      prt_newline(out);
 +
 +      prt_str(out, "open buckets allocated");
 +      prt_tab(out);
 +      prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
 +      prt_newline(out);
 +
 +      prt_str(out, "open buckets this dev");
 +      prt_tab(out);
 +      prt_u64(out, ca->nr_open_buckets);
 +      prt_newline(out);
 +
 +      prt_str(out, "open buckets total");
 +      prt_tab(out);
 +      prt_u64(out, OPEN_BUCKETS_COUNT);
 +      prt_newline(out);
 +
 +      prt_str(out, "open_buckets_wait");
 +      prt_tab(out);
 +      prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
 +      prt_newline(out);
 +
 +      prt_str(out, "open_buckets_btree");
 +      prt_tab(out);
 +      prt_u64(out, nr[BCH_DATA_btree]);
 +      prt_newline(out);
 +
 +      prt_str(out, "open_buckets_user");
 +      prt_tab(out);
 +      prt_u64(out, nr[BCH_DATA_user]);
 +      prt_newline(out);
 +
 +      prt_str(out, "buckets_to_invalidate");
 +      prt_tab(out);
 +      prt_u64(out, should_invalidate_buckets(ca, stats));
 +      prt_newline(out);
 +
 +      prt_str(out, "btree reserve cache");
 +      prt_tab(out);
 +      prt_u64(out, c->btree_reserve_cache_nr);
 +      prt_newline(out);
 +}
 +
 +static const char * const bch2_rw[] = {
 +      "read",
 +      "write",
 +      NULL
 +};
 +
 +static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
 +{
 +      int rw, i;
 +
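 +      /* io_done counters are in 512-byte sectors; << 9 converts to bytes: */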
 +      for (rw = 0; rw < 2; rw++) {
 +              prt_printf(out, "%s:\n", bch2_rw[rw]);
 +
 +              for (i = 1; i < BCH_DATA_NR; i++)
 +                      prt_printf(out, "%-12s:%12llu\n",
 +                             bch2_data_types[i],
 +                             percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
 +      }
 +}
 +
 +SHOW(bch2_dev)
 +{
 +      struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
 +      struct bch_fs *c = ca->fs;
 +
 +      sysfs_printf(uuid,              "%pU\n", ca->uuid.b);
 +
 +      sysfs_print(bucket_size,        bucket_bytes(ca));
 +      sysfs_print(first_bucket,       ca->mi.first_bucket);
 +      sysfs_print(nbuckets,           ca->mi.nbuckets);
 +      sysfs_print(durability,         ca->mi.durability);
 +      sysfs_print(discard,            ca->mi.discard);
 +
 +      if (attr == &sysfs_label) {
 +              if (ca->mi.group) {
 +                      mutex_lock(&c->sb_lock);
 +                      bch2_disk_path_to_text(out, c->disk_sb.sb,
 +                                             ca->mi.group - 1);
 +                      mutex_unlock(&c->sb_lock);
 +              }
 +
 +              prt_char(out, '\n');
 +      }
 +
 +      if (attr == &sysfs_has_data) {
 +              prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
 +              prt_char(out, '\n');
 +      }
 +
 +      if (attr == &sysfs_state_rw) {
 +              prt_string_option(out, bch2_member_states, ca->mi.state);
 +              prt_char(out, '\n');
 +      }
 +
 +      if (attr == &sysfs_iodone)
 +              dev_iodone_to_text(out, ca);
 +
 +      sysfs_print(io_latency_read,            atomic64_read(&ca->cur_latency[READ]));
 +      sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
 +
 +      if (attr == &sysfs_io_latency_stats_read)
 +              bch2_time_stats_to_text(out, &ca->io_latency[READ]);
 +
 +      if (attr == &sysfs_io_latency_stats_write)
 +              bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
 +
 +      sysfs_printf(congested,                 "%u%%",
 +                   clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
 +                   * 100 / CONGESTED_MAX);
 +
 +      if (attr == &sysfs_alloc_debug)
 +              dev_alloc_debug_to_text(out, ca);
 +
 +      return 0;
 +}
 +
 +STORE(bch2_dev)
 +{
 +      struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
 +      struct bch_fs *c = ca->fs;
 +      struct bch_member *mi;
 +
 +      if (attr == &sysfs_discard) {
 +              bool v = strtoul_or_return(buf);
 +
 +              mutex_lock(&c->sb_lock);
 +              mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 +
 +              if (v != BCH_MEMBER_DISCARD(mi)) {
 +                      SET_BCH_MEMBER_DISCARD(mi, v);
 +                      bch2_write_super(c);
 +              }
 +              mutex_unlock(&c->sb_lock);
 +      }
 +
 +      if (attr == &sysfs_durability) {
 +              u64 v = strtoul_or_return(buf);
 +
 +              mutex_lock(&c->sb_lock);
 +              mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 +
 +              if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
 +                      SET_BCH_MEMBER_DURABILITY(mi, v + 1);
 +                      bch2_write_super(c);
 +              }
 +              mutex_unlock(&c->sb_lock);
 +      }
 +
 +      if (attr == &sysfs_label) {
 +              char *tmp;
 +              int ret;
 +
 +              tmp = kstrdup(buf, GFP_KERNEL);
 +              if (!tmp)
 +                      return -ENOMEM;
 +
 +              ret = bch2_dev_group_set(c, ca, strim(tmp));
 +              kfree(tmp);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      return size;
 +}
 +SYSFS_OPS(bch2_dev);
 +
 +struct attribute *bch2_dev_files[] = {
 +      &sysfs_uuid,
 +      &sysfs_bucket_size,
 +      &sysfs_first_bucket,
 +      &sysfs_nbuckets,
 +      &sysfs_durability,
 +
 +      /* settings: */
 +      &sysfs_discard,
 +      &sysfs_state_rw,
 +      &sysfs_label,
 +
 +      &sysfs_has_data,
 +      &sysfs_iodone,
 +
 +      &sysfs_io_latency_read,
 +      &sysfs_io_latency_write,
 +      &sysfs_io_latency_stats_read,
 +      &sysfs_io_latency_stats_write,
 +      &sysfs_congested,
 +
 +      /* debug: */
 +      &sysfs_alloc_debug,
 +      NULL
 +};
 +
 +#endif /* NO_BCACHEFS_SYSFS */
Simple merge
Simple merge
diff --cc fs/ext4/ext4.h
Simple merge
Simple merge
diff --cc fs/ext4/inode.c
Simple merge
diff --cc fs/ext4/super.c
Simple merge
diff --cc fs/f2fs/super.c
Simple merge
diff --cc fs/gfs2/bmap.c
Simple merge
diff --cc fs/gfs2/glock.c
Simple merge
diff --cc fs/gfs2/quota.c
Simple merge
Simple merge
Simple merge
diff --cc fs/nfs/super.c
Simple merge
Simple merge
Simple merge
diff --cc fs/ntfs3/file.c
Simple merge
diff --cc fs/ocfs2/aops.c
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/super.c
Simple merge
Simple merge
diff --cc fs/ufs/inode.c
Simple merge
index 003e157241da1e39a888f351f70d33b94f25e8b2,9e7ba04572db0526db853e4922605077fbefa129..545c7991b9b584cb576d33d55cdd67826e0fa753
@@@ -1945,7 -1943,9 +1943,7 @@@ voi
  xfs_free_buftarg(
        struct xfs_buftarg      *btp)
  {
-       unregister_shrinker(&btp->bt_shrinker);
 -      struct block_device     *bdev = btp->bt_bdev;
 -
+       shrinker_free(btp->bt_shrinker);
        ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
        percpu_counter_destroy(&btp->bt_io_count);
        list_lru_destroy(&btp->bt_lru);
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc kernel/exit.c
Simple merge
diff --cc kernel/fork.c
Simple merge
Simple merge
Simple merge
diff --cc mm/mempolicy.c
Simple merge
diff --cc mm/mmap.c
Simple merge
diff --cc mm/nommu.c
Simple merge
diff --cc mm/percpu.c
Simple merge
diff --cc mm/shmem.c
index 6b102965d355f6865693c7ea91a5f6386970388b,a314a25aea8cceea80b920eebd1d89734941f333..71b8d957b63bec8384feb8c369289afcd95d65b4
@@@ -3339,11 -3349,11 +3349,11 @@@ static int shmem_unlink(struct inode *d
        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  
        dir->i_size -= BOGO_DIRENT_SIZE;
 -      dir->i_mtime = inode_set_ctime_to_ts(dir,
 -                                           inode_set_ctime_current(inode));
 +      inode_set_mtime_to_ts(dir,
 +                            inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);
        drop_nlink(inode);
-       dput(dentry);   /* Undo the count from "create" - this does all the work */
+       dput(dentry);   /* Undo the count from "create" - does all the work */
        return 0;
  }
  
diff --cc mm/util.c
Simple merge
Simple merge
index 9429d361059e0a07cf0d34c872c99903bdb347be,1c61e3c022cb84e3b6f1b49bafd70af12fa5239d..3c9bf0cd82a80dfe4189e273efbd9693f4f61b22
@@@ -115,210 -114,97 +115,223 @@@ static bool test_clone3(uint64_t flags
        ret = call_clone3(flags, size, test_mode);
        ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
                        getpid(), ret, expected);
 -      if (ret != expected)
 -              ksft_test_result_fail(
 +      if (ret != expected) {
 +              ksft_print_msg(
                        "[%d] Result (%d) is different than expected (%d)\n",
                        getpid(), ret, expected);
 -      else
 -              ksft_test_result_pass(
 -                      "[%d] Result (%d) matches expectation (%d)\n",
 -                      getpid(), ret, expected);
 -}
 -
 -int main(int argc, char *argv[])
 -{
 -      uid_t uid = getuid();
 -
 -      ksft_print_header();
 -      ksft_set_plan(19);
 -      test_clone3_supported();
 -
 -      /* Just a simple clone3() should return 0.*/
 -      test_clone3(0, 0, 0, CLONE3_ARGS_NO_TEST);
 -
 -      /* Do a clone3() in a new PID NS.*/
 -      if (uid == 0)
 -              test_clone3(CLONE_NEWPID, 0, 0, CLONE3_ARGS_NO_TEST);
 -      else
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
 +              return false;
 +      }
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0. */
 -      test_clone3(0, CLONE_ARGS_SIZE_VER0, 0, CLONE3_ARGS_NO_TEST);
 +      return true;
 +}
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 */
 -      test_clone3(0, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST);
 +typedef bool (*filter_function)(void);
 +typedef size_t (*size_function)(void);
  
 -      /* Do a clone3() with sizeof(struct clone_args) + 8 */
 -      test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_NO_TEST);
 +static bool not_root(void)
 +{
 +      if (getuid() != 0) {
 +              ksft_print_msg("Not running as root\n");
 +              return true;
 +      }
  
 -      /* Do a clone3() with exit_signal having highest 32 bits non-zero */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG);
 +      return false;
 +}
  
 -      /* Do a clone3() with negative 32-bit exit_signal */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG);
++static bool no_timenamespace(void)
++{
++      if (not_root())
++              return true;
 -      /* Do a clone3() with exit_signal not fitting into CSIGNAL mask */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG);
++      if (!access("/proc/self/ns/time", F_OK))
++              return false;
 -      /* Do a clone3() with NSIG < exit_signal < CSIG */
 -      test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG);
++      ksft_print_msg("Time namespaces are not supported\n");
++      return true;
++}
 -      test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_ALL_0);
 +static size_t page_size_plus_8(void)
 +{
 +      return getpagesize() + 8;
 +}
  
 -      test_clone3(0, sizeof(struct __clone_args) + 16, -E2BIG,
 -                      CLONE3_ARGS_ALL_0);
 +struct test {
 +      const char *name;
 +      uint64_t flags;
 +      size_t size;
 +      size_function size_function;
 +      int expected;
 +      enum test_mode test_mode;
 +      filter_function filter;
 +};
  
 -      test_clone3(0, sizeof(struct __clone_args) * 2, -E2BIG,
 -                      CLONE3_ARGS_ALL_0);
 +static const struct test tests[] = {
 +      {
 +              .name = "simple clone3()",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "clone3() in a new PID_NS",
 +              .flags = CLONE_NEWPID,
 +              .size = 0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +              .filter = not_root,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0",
 +              .flags = 0,
 +              .size = CLONE_ARGS_SIZE_VER0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0 - 8",
 +              .flags = 0,
 +              .size = CLONE_ARGS_SIZE_VER0 - 8,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "sizeof(struct clone_args) + 8",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 8,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "exit_signal with highest 32 bits non-zero",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG,
 +      },
 +      {
 +              .name = "negative 32-bit exit_signal",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
 +      },
 +      {
 +              .name = "exit_signal not fitting into CSIGNAL mask",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
 +      },
 +      {
 +              .name = "NSIG < exit_signal < CSIG",
 +              .flags = 0,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
 +      },
 +      {
 +              .name = "Arguments sizeof(struct clone_args) + 8",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 8,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_ALL_0,
 +      },
 +      {
 +              .name = "Arguments sizeof(struct clone_args) + 16",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 16,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_ALL_0,
 +      },
 +      {
 +              .name = "Arguments sizeof(struct clone_arg) * 2",
 +              .flags = 0,
 +              .size = sizeof(struct __clone_args) + 16,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_ALL_0,
 +      },
 +      {
 +              .name = "Arguments > page size",
 +              .flags = 0,
 +              .size_function = page_size_plus_8,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0 in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size = CLONE_ARGS_SIZE_VER0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +              .filter = not_root,
 +      },
 +      {
 +              .name = "CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size = CLONE_ARGS_SIZE_VER0 - 8,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "sizeof(struct clone_args) + 8 in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size = sizeof(struct __clone_args) + 8,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +              .filter = not_root,
 +      },
 +      {
 +              .name = "Arguments > page size in a new PID NS",
 +              .flags = CLONE_NEWPID,
 +              .size_function = page_size_plus_8,
 +              .expected = -E2BIG,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +      {
 +              .name = "New time NS",
 +              .flags = CLONE_NEWTIME,
 +              .size = 0,
 +              .expected = 0,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
++              .filter = no_timenamespace,
 +      },
 +      {
 +              .name = "exit signal (SIGCHLD) in flags",
 +              .flags = SIGCHLD,
 +              .size = 0,
 +              .expected = -EINVAL,
 +              .test_mode = CLONE3_ARGS_NO_TEST,
 +      },
 +};
  
 -      /* Do a clone3() with > page size */
 -      test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST);
 +int main(int argc, char *argv[])
 +{
 +      size_t size;
 +      int i;
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0 in a new PID NS. */
 -      if (uid == 0)
 -              test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0, 0,
 -                              CLONE3_ARGS_NO_TEST);
 -      else
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
 +      ksft_print_header();
 +      ksft_set_plan(ARRAY_SIZE(tests));
 +      test_clone3_supported();
  
 -      /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS */
 -      test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL,
 -                      CLONE3_ARGS_NO_TEST);
 +      for (i = 0; i < ARRAY_SIZE(tests); i++) {
 +              if (tests[i].filter && tests[i].filter()) {
 +                      ksft_test_result_skip("%s\n", tests[i].name);
 +                      continue;
 +              }
  
 -      /* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */
 -      if (uid == 0)
 -              test_clone3(CLONE_NEWPID, sizeof(struct __clone_args) + 8, 0,
 -                              CLONE3_ARGS_NO_TEST);
 -      else
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
 +              if (tests[i].size_function)
 +                      size = tests[i].size_function();
 +              else
 +                      size = tests[i].size;
  
 -      /* Do a clone3() with > page size in a new PID NS */
 -      test_clone3(CLONE_NEWPID, getpagesize() + 8, -E2BIG,
 -                      CLONE3_ARGS_NO_TEST);
 +              ksft_print_msg("Running test '%s'\n", tests[i].name);
  
 -      /* Do a clone3() in a new time namespace */
 -      if (access("/proc/self/ns/time", F_OK) == 0) {
 -              test_clone3(CLONE_NEWTIME, 0, 0, CLONE3_ARGS_NO_TEST);
 -      } else {
 -              ksft_print_msg("Time namespaces are not supported\n");
 -              ksft_test_result_skip("Skipping clone3() with CLONE_NEWTIME\n");
 +              ksft_test_result(test_clone3(tests[i].flags, size,
 +                                           tests[i].expected,
 +                                           tests[i].test_mode),
 +                               "%s\n", tests[i].name);
        }
  
 -      /* Do a clone3() with exit signal (SIGCHLD) in flags */
 -      test_clone3(SIGCHLD, 0, -EINVAL, CLONE3_ARGS_NO_TEST);
 -
        ksft_finished();
  }
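
For readers unfamiliar with the table-driven pattern the converted clone3.c above adopts, the following standalone sketch mirrors the same idea outside the kselftest harness: an array of test descriptors, optional per-test filter callbacks that decide whether a test should be skipped, and a single loop that runs and reports each entry. The demo_* names, the trivial demo_op() function and the plain printf() TAP-style reporting are illustrative stand-ins only; they are not part of the kernel selftest API or of this commit.

/*
 * Minimal sketch of a table-driven test runner, assuming made-up demo_*
 * names; printf() stands in for the ksft_* reporting helpers used above.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

typedef bool (*demo_filter_fn)(void);	/* return true to skip the test */

struct demo_test {
	const char *name;
	int input;
	int expected;
	demo_filter_fn filter;
};

/* Example filter: skip tests that need root, mirroring not_root() above. */
static bool demo_not_root(void)
{
	if (getuid() != 0) {
		printf("# not running as root\n");
		return true;
	}
	return false;
}

/* Trivial function under test, standing in for test_clone3(). */
static int demo_op(int input)
{
	return input * 2;
}

static const struct demo_test demo_tests[] = {
	{ .name = "doubles zero",	.input = 0, .expected = 0 },
	{ .name = "doubles two",	.input = 2, .expected = 4 },
	{ .name = "needs root",		.input = 3, .expected = 6,
	  .filter = demo_not_root },
};

int main(void)
{
	size_t i, ntests = sizeof(demo_tests) / sizeof(demo_tests[0]);

	/* Plan line derives from the table, like ksft_set_plan(ARRAY_SIZE(tests)). */
	printf("1..%zu\n", ntests);

	for (i = 0; i < ntests; i++) {
		const struct demo_test *t = &demo_tests[i];

		/* A filter returning true means "skip", as in the loop above. */
		if (t->filter && t->filter()) {
			printf("ok %zu # SKIP %s\n", i + 1, t->name);
			continue;
		}

		printf("%s %zu - %s\n",
		       demo_op(t->input) == t->expected ? "ok" : "not ok",
		       i + 1, t->name);
	}

	return 0;
}

The conversion in the diff follows the same shape: each former test_clone3() call becomes one table entry, the ad hoc uid and /proc/self/ns/time checks become reusable filter callbacks, and the test plan is taken from ARRAY_SIZE(tests) instead of the previously hand-maintained count of 19.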
index 60a9a305aef071ee7aad5210fe456701d240c28a,56f0230a8b92d37a4eb499ba69062f16393b02d4..56f0230a8b92d37a4eb499ba69062f16393b02d4
mode 100755,100644..100755