futex: Use RCU-based per-CPU reference counting instead of rcuref_t

author Peter Zijlstra <peterz@infradead.org>

Thu, 10 Jul 2025 11:00:07 +0000 (13:00 +0200)

committer Peter Zijlstra <peterz@infradead.org>

Fri, 11 Jul 2025 14:02:00 +0000 (16:02 +0200)
author Peter Zijlstra <peterz@infradead.org>
Thu, 10 Jul 2025 11:00:07 +0000 (13:00 +0200)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 11 Jul 2025 14:02:00 +0000 (16:02 +0200)
diff --git a/include/linux/futex.h b/include/linux/futex.h

index b37193653e6b5d7d562ac08f93d4ee41eb8e72a2..9e9750f04980548392b745c09da3a43619fda321 100644 (file)
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -85,18 +85,12 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
  #ifdef CONFIG_FUTEX_PRIVATE_HASH
  int futex_hash_allocate_default(void);
  void futex_hash_free(struct mm_struct *mm);
-
-static inline void futex_mm_init(struct mm_struct *mm)
-{
-       RCU_INIT_POINTER(mm->futex_phash, NULL);
-       mm->futex_phash_new = NULL;
-       mutex_init(&mm->futex_hash_lock);
-}
+int futex_mm_init(struct mm_struct *mm);
  
  #else /* !CONFIG_FUTEX_PRIVATE_HASH */
  static inline int futex_hash_allocate_default(void) { return 0; }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
+static inline void futex_hash_free(struct mm_struct *mm) { } /* no-op stub; void like the real prototype */
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
  #endif /* CONFIG_FUTEX_PRIVATE_HASH */
  
  #else /* !CONFIG_FUTEX */
@@ -118,8 +112,8 @@ static inline int futex_hash_allocate_default(void)
  {
         return 0;
  }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
+static inline void futex_hash_free(struct mm_struct *mm) { } /* no-op stub; void like the real prototype */
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
  
  #endif
  
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index d6b91e8a66d6d310b1e10b43effbf3e600c4f8e5..0f0662157066a5b874084a9c8aca5050e3c1cc85 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1070,6 +1070,11 @@ struct mm_struct {
                 struct mutex                    futex_hash_lock;
                 struct futex_private_hash       __rcu *futex_phash;
                 struct futex_private_hash       *futex_phash_new;
+               /* futex-ref */
+               unsigned long                   futex_batches;
+               struct rcu_head                 futex_rcu;
+               atomic_long_t                   futex_atomic;
+               unsigned int                    __percpu *futex_ref;
  #endif
  
                 unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h

index b13474825130fd1809c46ffa1875eea782be1ca8..2201da0afecc500702687ce60b436bbfad55687b 100644 (file)
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
  
  /* mmput gets rid of the mappings and all user-space */
  extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
  /* same as above but performs the slow path from the async context. Can
   * be called from the atomic context as well
   */
diff --git a/init/Kconfig b/init/Kconfig

index 666783eb50abd7de2edaf5e367b0750a693fefbb..af4c2f0854554bbcdf193852cf5c1d2c2accc64f 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1716,13 +1716,9 @@ config FUTEX_PI
         depends on FUTEX && RT_MUTEXES
         default y
  
-#
-# marked broken for performance reasons; gives us one more cycle to sort things out.
-#
  config FUTEX_PRIVATE_HASH
         bool
         depends on FUTEX && !BASE_SMALL && MMU
-       depends on BROKEN
         default y
  
  config FUTEX_MPOL
diff --git a/kernel/fork.c b/kernel/fork.c

index 1ee8eb11f38bae1d2eb6de9494aea94b7a19e6c3..0b885dcbde9af18ed3517d5d37ce0e1471ee8161 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         RCU_INIT_POINTER(mm->exe_file, NULL);
         mmu_notifier_subscriptions_init(mm);
         init_tlb_flush_pending(mm);
-       futex_mm_init(mm);
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
         mm->pmd_huge_pte = NULL;
  #endif
@@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
                 mm->def_flags = 0;
         }
  
+       if (futex_mm_init(mm))
+               goto fail_mm_init;
+
         if (mm_alloc_pgd(mm))
                 goto fail_nopgd;
  
@@ -1090,6 +1092,8 @@ fail_nocontext:
  fail_noid:
         mm_free_pgd(mm);
  fail_nopgd:
+       futex_hash_free(mm);
+fail_mm_init:
         free_mm(mm);
         return NULL;
  }
@@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm)
  }
  EXPORT_SYMBOL_GPL(mmput);
  
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
  static void mmput_async_fn(struct work_struct *work)
  {
         struct mm_struct *mm = container_of(work, struct mm_struct,
diff --git a/kernel/futex/core.c b/kernel/futex/core.c

index 90d53fb0ee9e1563c355ef0499df441367e1a46c..1dcb4c8a2585dc29fd2d30dd5e3c9999c922bf58 100644 (file)
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -42,7 +42,6 @@
  #include <linux/fault-inject.h>
  #include <linux/slab.h>
  #include <linux/prctl.h>
-#include <linux/rcuref.h>
  #include <linux/mempolicy.h>
  #include <linux/mmap_lock.h>
  
@@ -65,7 +64,7 @@ static struct {
  #define futex_queues   (__futex_data.queues)
  
  struct futex_private_hash {
-       rcuref_t        users;
+       int             state;
         unsigned int    hash_mask;
         struct rcu_head rcu;
         void            *mm;
@@ -129,6 +128,12 @@ static struct futex_hash_bucket *
  __futex_hash(union futex_key *key, struct futex_private_hash *fph);
  
  #ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
  static inline bool futex_key_is_private(union futex_key *key)
  {
         /*
@@ -142,15 +147,14 @@ bool futex_private_hash_get(struct futex_private_hash *fph)
  {
         if (fph->immutable)
                 return true;
-       return rcuref_get(&fph->users);
+       return futex_ref_get(fph);
  }
  
  void futex_private_hash_put(struct futex_private_hash *fph)
  {
-       /* Ignore return value, last put is verified via rcuref_is_dead() */
         if (fph->immutable)
                 return;
-       if (rcuref_put(&fph->users))
+       if (futex_ref_put(fph))
                 wake_up_var(fph->mm);
  }
  
@@ -243,14 +247,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
         fph = rcu_dereference_protected(mm->futex_phash,
                                         lockdep_is_held(&mm->futex_hash_lock));
         if (fph) {
-               if (!rcuref_is_dead(&fph->users)) {
+               if (!futex_ref_is_dead(fph)) {
                         mm->futex_phash_new = new;
                         return false;
                 }
  
                 futex_rehash_private(fph, new);
         }
-       rcu_assign_pointer(mm->futex_phash, new);
+       new->state = FR_PERCPU;
+       scoped_guard(rcu) {
+               mm->futex_batches = get_state_synchronize_rcu();
+               rcu_assign_pointer(mm->futex_phash, new);
+       }
         kvfree_rcu(fph, rcu);
         return true;
  }
@@ -289,9 +297,7 @@ again:
                 if (!fph)
                         return NULL;
  
-               if (fph->immutable)
-                       return fph;
-               if (rcuref_get(&fph->users))
+               if (futex_private_hash_get(fph))
                         return fph;
         }
         futex_pivot_hash(mm);
@@ -1527,16 +1533,219 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
  #define FH_IMMUTABLE   0x02
  
  #ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+       struct mm_struct *mm = fph->mm;
+
+       /*
+        * The counter we're about to switch to must have fully switched;
+        * otherwise it would be impossible for it to have reported success
+        * from futex_ref_is_dead().
+        */
+       WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+       /*
+        * Bias the atomic so futex_ref_{get,put}() never observe 0; fixed up
+        * in __futex_ref_atomic_end() when folding in the percpu count. The
+        * release store pairs with the acquire loads of fph->state.
+        */
+       atomic_long_set(&mm->futex_atomic, LONG_MAX);
+       smp_store_release(&fph->state, FR_ATOMIC);
+       /* Callback finds FR_ATOMIC and runs __futex_ref_atomic_end(). */
+       call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+       struct mm_struct *mm = fph->mm;
+       unsigned int count = 0; /* same type as the per-CPU counters */
+       long ret;
+       int cpu;
+
+       /*
+        * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+        * and per this RCU callback, everybody must now observe this state and
+        * use the atomic variable.
+        */
+       WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+       /*
+        * Therefore the per-cpu counter is now stable, sum and reset.
+        */
+       for_each_possible_cpu(cpu) {
+               unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+               count += *ptr;
+               *ptr = 0;
+       }
+
+       /*
+        * Re-init for the next cycle.
+        */
+       this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+       /*
+        * Add actual count, subtract bias and initial refcount.
+        *
+        * The moment this atomic operation happens, futex_ref_is_dead() can
+        * become true.
+        */
+       ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+       if (!ret)
+               wake_up_var(mm); /* no references left: wake waiters on mm */
+
+       WARN_ON_ONCE(ret < 0);
+       mmput_async(mm); /* pairs with mmget() in futex_ref_drop() */
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+       struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+       struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+       /* Runs up to twice per transition; fph->state selects the stage. */
+       if (fph->state == FR_PERCPU) {
+               /*
+                * Per this extra grace-period, everybody must now observe
+                * fph as the current fph and no previously observed fph's
+                * are in-flight.
+                *
+                * Notably, nobody will now rely on the atomic
+                * futex_ref_is_dead() state anymore so we can begin the
+                * migration of the per-cpu counter into the atomic.
+                */
+               __futex_ref_atomic_begin(fph);
+               return;
+       }
+       /* Second stage: state is already FR_ATOMIC, fold and finish. */
+       __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+       struct mm_struct *mm = fph->mm;
+
+       /*
+        * Can only transition the current fph.
+        */
+       WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+       /*
+        * We enqueue at least one RCU callback. Ensure mm stays if the task
+        * exits before the transition is completed.
+        */
+       mmget(mm);
+
+       /*
+        * In order to avoid the following scenario:
+        *
+        * futex_hash()                 __futex_pivot_hash()
+        *   guard(rcu);                  guard(mm->futex_hash_lock);
+        *   fph = mm->futex_phash;
+        *                                rcu_assign_pointer(&mm->futex_phash, new);
+        *                              futex_hash_allocate()
+        *                                futex_ref_drop()
+        *                                  fph->state = FR_ATOMIC;
+        *                                  atomic_set(, BIAS);
+        *
+        *   futex_private_hash_get(fph); // OOPS
+        *
+        * Where an old fph (which is FR_ATOMIC and should fail on
+        * inc_not_zero) will succeed because a new transition is started and
+        * the atomic is bias'ed away from 0.
+        *
+        * There must be at least one full grace-period between publishing a
+        * new fph and trying to replace it.
+        */
+       if (poll_state_synchronize_rcu(mm->futex_batches)) {
+               /*
+                * A grace-period passed since fph was published; begin now.
+                */
+               __futex_ref_atomic_begin(fph);
+               return;
+       }
+       /* Too early; futex_ref_rcu() begins after a grace period. */
+       call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+       struct mm_struct *mm = fph->mm;
+
+       guard(rcu)();
+       /* FR_PERCPU: per-CPU increment, always succeeds. */
+       if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+               this_cpu_inc(*mm->futex_ref);
+               return true;
+       }
+       /* FR_ATOMIC: fails once the atomic count has reached zero. */
+       return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+       struct mm_struct *mm = fph->mm;
+
+       guard(rcu)();
+       /* FR_PERCPU: per-CPU decrement; a final put is not detected here. */
+       if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+               this_cpu_dec(*mm->futex_ref);
+               return false;
+       }
+       /* FR_ATOMIC: true when this put drops the count to zero. */
+       return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+       struct mm_struct *mm = fph->mm;
+
+       guard(rcu)();
+       /* A hash still counting per-CPU is never reported dead. */
+       if (smp_load_acquire(&fph->state) == FR_PERCPU)
+               return false;
+       /* FR_ATOMIC: dead once the atomic count reads zero. */
+       return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+       mutex_init(&mm->futex_hash_lock);
+       RCU_INIT_POINTER(mm->futex_phash, NULL);
+       mm->futex_phash_new = NULL;
+       /* futex-ref: atomic starts at 0, per-CPU count at the initial 1 */
+       atomic_long_set(&mm->futex_atomic, 0);
+       mm->futex_batches = get_state_synchronize_rcu();
+       mm->futex_ref = alloc_percpu(unsigned int);
+       if (!mm->futex_ref)
+               return -ENOMEM; /* mm_init() then takes fail_mm_init */
+       this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+       return 0;
+}
+
  void futex_hash_free(struct mm_struct *mm)
  {
         struct futex_private_hash *fph;
  
+       free_percpu(mm->futex_ref);
         kvfree(mm->futex_phash_new);
         fph = rcu_dereference_raw(mm->futex_phash);
-       if (fph) {
-               WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+       if (fph)
                 kvfree(fph);
-       }
  }
  
  static bool futex_pivot_pending(struct mm_struct *mm)
@@ -1549,7 +1758,7 @@ static bool futex_pivot_pending(struct mm_struct *mm)
                 return true;
  
         fph = rcu_dereference(mm->futex_phash);
-       return rcuref_is_dead(&fph->users);
+       return futex_ref_is_dead(fph);
  }
  
  static bool futex_hash_less(struct futex_private_hash *a,
@@ -1598,11 +1807,11 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
                 }
         }
  
-       fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+       fph = kvzalloc(struct_size(fph, queues, hash_slots),
+                      GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
         if (!fph)
                 return -ENOMEM;
  
-       rcuref_init(&fph->users, 1);
         fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
         fph->custom = custom;
         fph->immutable = !!(flags & FH_IMMUTABLE);
@@ -1645,7 +1854,7 @@ again:
                                  * allocated a replacement hash, drop the initial
                                  * reference on the existing hash.
                                  */
-                               futex_private_hash_put(cur);
+                               futex_ref_drop(cur);
                         }
  
                         if (new) {
author	Peter Zijlstra <peterz@infradead.org>
	Thu, 10 Jul 2025 11:00:07 +0000 (13:00 +0200)
committer	Peter Zijlstra <peterz@infradead.org>
	Fri, 11 Jul 2025 14:02:00 +0000 (16:02 +0200)
include/linux/futex.h		patch \| blob \| blame \| history
include/linux/mm_types.h		patch \| blob \| blame \| history
include/linux/sched/mm.h		patch \| blob \| blame \| history
init/Kconfig		patch \| blob \| blame \| history
kernel/fork.c		patch \| blob \| blame \| history
kernel/futex/core.c		patch \| blob \| blame \| history