]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
futex: Optimize futex hash bucket access patterns
authorPeter Zijlstra <peterz@infradead.org>
Wed, 10 Jun 2026 21:20:09 +0000 (23:20 +0200)
committerPeter Zijlstra <peterz@infradead.org>
Thu, 11 Jun 2026 11:41:24 +0000 (13:41 +0200)
Breno reported significant c2c HITM in a futex hash heavy workload.

It turns out that the hash bucket to private hash table reverse pointer
(futex_hash_bucket::priv) was to blame. Notably when the hash buckets are
heavily contended, the: 'fph = bh->priv;' load in futex_hash() will typically
miss and consequently become quite expensive.

Since this load in particular is quite superfluous, removing it is fairly
straight forward. However, removing it does not in fact achieve anything much.
The pain moves to the next user, notably: futex_hash_put().

Therefore rework the whole private hash refcounting to avoid needing this back
pointer (and removing it). Instead of passing around 'struct futex_hash_bucket
*hb', pass around a new structure that contains it and the related 'struct
futex_private_hash *fph' pointer in tandem.

Funnily this turns out to remove more code than it adds and significantly
improves futex hash performance (as measured by 'perf bench futex hash'):

SKL dual socket 112 threads:

  Baseline        Patched
  shared (16k)    1571857         1641435         + 4.4%
  autosize (512)   646390          903371         +39.7%
  -b 256           464395          587014         +26.4%
  -b 512           715687          995943         +39.2%
  -b 1024          995085         1396328         +40.3%
  -b 2048         1293114         1668395         +29.0%
  -b 4096         2124438         2240228         + 5.5%

Zen3 dual socket 256 threads:

                  Baseline        Patched
  shared  (16k)   1275840         1381279         + 8.2%
  autosize (512)  1252745         1482179         +18.3%
  -b 256           856274          955455         +11.5%
  -b 512          1267490         1544010         +21.8%
  -b 1024         1424013         1625424         +14.1%
  -b 2048         1505181         1669342         +10.9%
  -b 4096         1465993         1688932         +15.2%

AMD EPYC 9D64 (Zen4, single socket) 176 threads:

                       Baseline       Patched      Delta
  shared (16k)         1,230,599      1,368,655    +11.2%
  autosize (1024)      1,285,440      1,556,946    +21.1%
  -b 256               1,341,471      1,520,303    +13.3%
  -b 512               1,438,330      1,599,319    +11.2%
  -b 1024              1,443,772      1,622,493    +12.4%
  -b 2048              1,472,108      1,643,975    +11.7%
  -b 4096              1,333,098      1,570,897    +17.8%

Reported-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Breno Leitao <leitao@debian.org>
Tested-by: Thomas Gleixner <tglx@kernel.org>
Link: https://patch.msgid.link/20260610135510.GB1430057@noisy.programming.kicks-ass.net
kernel/futex/core.c
kernel/futex/futex.h
kernel/futex/pi.c
kernel/futex/requeue.c
kernel/futex/waitwake.c

index 6ea4a97796a1a70f1d01ba45e5bcd65653359440..179b26e9c934166aaade6939ef80c306da76727a 100644 (file)
@@ -127,7 +127,7 @@ late_initcall(fail_futex_debugfs);
 #endif /* CONFIG_FAIL_FUTEX */
 
 static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p);
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 static bool futex_ref_get(struct futex_private_hash *fph);
@@ -136,15 +136,6 @@ static bool futex_ref_is_dead(struct futex_private_hash *fph);
 
 enum { FR_PERCPU = 0, FR_ATOMIC };
 
-static inline bool futex_key_is_private(union futex_key *key)
-{
-       /*
-        * Relies on get_futex_key() to set either bit for shared
-        * futexes -- see comment with union futex_key.
-        */
-       return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
-}
-
 static bool futex_private_hash_get(struct futex_private_hash *fph)
 {
        return futex_ref_get(fph);
@@ -152,48 +143,15 @@ static bool futex_private_hash_get(struct futex_private_hash *fph)
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
-       if (futex_ref_put(fph))
+       if (fph && futex_ref_put(fph))
                wake_up_var(fph->mm);
 }
 
-/**
- * futex_hash_get - Get an additional reference for the local hash.
- * @hb:                    ptr to the private local hash.
- *
- * Obtain an additional reference for the already obtained hash bucket. The
- * caller must already own an reference.
- */
-void futex_hash_get(struct futex_hash_bucket *hb)
-{
-       struct futex_private_hash *fph = hb->priv;
-
-       if (!fph)
-               return;
-       WARN_ON_ONCE(!futex_private_hash_get(fph));
-}
-
-void futex_hash_put(struct futex_hash_bucket *hb)
-{
-       struct futex_private_hash *fph = hb->priv;
-
-       if (!fph)
-               return;
-       futex_private_hash_put(fph);
-}
-
 static struct futex_hash_bucket *
 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
 {
        u32 hash;
 
-       if (!futex_key_is_private(key))
-               return NULL;
-
-       if (!fph)
-               fph = rcu_dereference(key->private.mm->futex.phash.hash);
-       if (!fph || !fph->hash_mask)
-               return NULL;
-
        hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4,
                      key->both.offset);
 
@@ -214,13 +172,12 @@ static void futex_rehash_private(struct futex_private_hash *old,
 
                spin_lock(&hb_old->lock);
                plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
-
                        plist_del(&this->list, &hb_old->chain);
                        futex_hb_waiters_dec(hb_old);
 
                        WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
 
-                       hb_new = __futex_hash(&this->key, new);
+                       hb_new = __futex_hash(&this->key, new, NULL);
                        futex_hb_waiters_inc(hb_new);
                        /*
                         * The new pointer isn't published yet but an already
@@ -273,9 +230,8 @@ static void futex_pivot_hash(struct mm_struct *mm)
        }
 }
 
-struct futex_private_hash *futex_private_hash(void)
+struct futex_private_hash *futex_private_hash(struct mm_struct *mm)
 {
-       struct mm_struct *mm = current->mm;
        /*
         * Ideally we don't loop. If there is a replacement in progress
         * then a new private hash is already prepared and a reference can't be
@@ -301,18 +257,17 @@ again:
        goto again;
 }
 
-struct futex_hash_bucket *futex_hash(union futex_key *key)
+struct futex_bucket_ref futex_hash(union futex_key *key)
 {
-       struct futex_private_hash *fph;
-       struct futex_hash_bucket *hb;
-
 again:
        scoped_guard(rcu) {
-               hb = __futex_hash(key, NULL);
-               fph = hb->priv;
+               struct futex_private_hash *fph = NULL;
+               struct futex_hash_bucket *hb;
+
+               hb = __futex_hash(key, NULL, &fph);
 
                if (!fph || futex_private_hash_get(fph))
-                       return hb;
+                       return (struct futex_bucket_ref){ .hb = hb, .fph = fph };
        }
        futex_pivot_hash(key->private.mm);
        goto again;
@@ -320,15 +275,9 @@ again:
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
 
-static struct futex_hash_bucket *
-__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
-{
-       return NULL;
-}
-
-struct futex_hash_bucket *futex_hash(union futex_key *key)
+struct futex_bucket_ref futex_hash(union futex_key *key)
 {
-       return __futex_hash(key, NULL);
+       return (struct futex_bucket_ref){ .hb = __futex_hash(key, NULL, NULL), .fph = NULL };
 }
 
 #endif /* CONFIG_FUTEX_PRIVATE_HASH */
@@ -406,6 +355,8 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
  * __futex_hash - Return the hash bucket
  * @key:       Pointer to the futex key for which the hash is calculated
  * @fph:       Pointer to private hash if known
+ * @fph_p:     Pointer to a private hash pointer; output for the private hash
+ *              used when set.
  *
  * We hash on the keys returned from get_futex_key (see below) and return the
  * corresponding hash bucket.
@@ -413,18 +364,23 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
  * private hash) is returned if existing. Otherwise a hash bucket from the
  * global hash is returned.
  */
-static struct futex_hash_bucket *__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p)
 {
        int node = key->both.node;
        u32 hash;
 
-       if (node == FUTEX_NO_NODE) {
-               struct futex_hash_bucket *hb;
-
-               hb = __futex_hash_private(key, fph);
-               if (hb)
-                       return hb;
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+       if (node == FUTEX_NO_NODE && futex_key_is_private(key)) {
+               if (!fph)
+                       fph = rcu_dereference(key->private.mm->futex.phash.hash);
+               if (fph && fph->hash_mask) {
+                       if (fph_p)
+                               *fph_p = fph;
+                       return __futex_hash_private(key, fph);
+               }
        }
+#endif
 
        hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32),
                      key->both.offset);
@@ -1367,7 +1323,7 @@ static void exit_pi_state_list(struct task_struct *curr)
         * on the mutex.
         */
        WARN_ON(curr != current);
-       guard(private_hash)();
+       guard(private_hash)(current->mm);
        /*
         * We are a ZOMBIE and nobody can enqueue itself on
         * pi_state_list anymore, but we have to be careful
@@ -1379,7 +1335,8 @@ static void exit_pi_state_list(struct task_struct *curr)
                pi_state = list_entry(next, struct futex_pi_state, list);
                key = pi_state->key;
                if (1) {
-                       CLASS(hb, hb)(&key);
+                       CLASS(hbr, hbr)(&key);
+                       auto hb = hbr.hb;
 
                        /*
                         * We can race against put_pi_state() removing itself from the
@@ -1576,12 +1533,8 @@ void futex_exit_release(struct task_struct *tsk)
        futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
-                                  struct futex_private_hash *fph)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
 {
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
-       fhb->priv = fph;
-#endif
        atomic_set(&fhb->waiters, 0);
        plist_head_init(&fhb->chain);
        spin_lock_init(&fhb->lock);
@@ -1877,7 +1830,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
        fph->mm = mm;
 
        for (i = 0; i < hash_slots; i++)
-               futex_hash_bucket_init(&fph->queues[i], fph);
+               futex_hash_bucket_init(&fph->queues[i]);
 
        if (custom) {
                /*
@@ -2082,7 +2035,7 @@ static int __init futex_init(void)
                BUG_ON(!table);
 
                for (i = 0; i < hashsize; i++)
-                       futex_hash_bucket_init(&table[i], NULL);
+                       futex_hash_bucket_init(&table[i]);
 
                futex_queues[n] = table;
        }
index 79ef2c709c81e1360545b4a5cb5bc0b2363b0011..f00f0863ed4412eaa981422571b4157a9344821f 100644 (file)
@@ -134,6 +134,15 @@ static inline bool should_fail_futex(bool fshared)
 }
 #endif
 
+static inline bool futex_key_is_private(union futex_key *key)
+{
+       /*
+        * Relies on get_futex_key() to set either bit for shared
+        * futexes -- see comment with union futex_key.
+        */
+       return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
 /*
  * Hash buckets are shared by all the futex_keys that hash to the same
  * location.  Each key may have multiple futex_q structures, one for each task
@@ -143,7 +152,6 @@ struct futex_hash_bucket {
        atomic_t waiters;
        spinlock_t lock;
        struct plist_head chain;
-       struct futex_private_hash *priv;
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -183,7 +191,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
  * @requeue_pi_key:    the requeue_pi target futex key
  * @bitset:            bitset for the optional bitmasked wakeup
  * @requeue_state:     State field for futex_requeue_pi()
- * @drop_hb_ref:       Waiter should drop the extra hash bucket reference if true
+ * @drop_fph:          Waiter should drop the extra private hash reference when set
  * @requeue_wait:      RCU wait for futex_requeue_pi() (RT only)
  *
  * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -210,7 +218,7 @@ struct futex_q {
        union futex_key *requeue_pi_key;
        u32 bitset;
        atomic_t requeue_state;
-       bool drop_hb_ref;
+       struct futex_private_hash *drop_fph;
 #ifdef CONFIG_PREEMPT_RT
        struct rcuwait requeue_wait;
 #endif
@@ -230,28 +238,29 @@ extern struct hrtimer_sleeper *
 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
                  int flags, u64 range_ns);
 
-extern struct futex_hash_bucket *futex_hash(union futex_key *key);
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
-extern void futex_hash_get(struct futex_hash_bucket *hb);
-extern void futex_hash_put(struct futex_hash_bucket *hb);
+struct futex_bucket_ref {
+       struct futex_hash_bucket *hb;
+       struct futex_private_hash *fph;
+};
 
-extern struct futex_private_hash *futex_private_hash(void);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern struct futex_private_hash *futex_private_hash(struct mm_struct *mm);
 extern void futex_private_hash_put(struct futex_private_hash *fph);
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
-static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
-static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
-static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline struct futex_private_hash *futex_private_hash(struct mm_struct *mm) { return NULL; }
 static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
 #endif
 
-DEFINE_CLASS(hb, struct futex_hash_bucket *,
-            if (_T) futex_hash_put(_T),
+extern struct futex_bucket_ref futex_hash(union futex_key *key);
+
+DEFINE_CLASS(hbr, struct futex_bucket_ref,
+            if (_T.fph) futex_private_hash_put(_T.fph),
             futex_hash(key), union futex_key *key);
 
 DEFINE_CLASS(private_hash, struct futex_private_hash *,
             if (_T) futex_private_hash_put(_T),
-            futex_private_hash(), void);
+            futex_private_hash(mm), struct mm_struct *mm);
 
 /**
  * futex_match - Check whether two futex keys are equal
index 9dd5c0b1ac6ab1407b5d287367690c7259a156d3..795011ea1202f13d804e237ce665826d4159836c 100644 (file)
@@ -945,7 +945,8 @@ retry:
 
 retry_private:
        if (1) {
-               CLASS(hb, hb)(&q.key);
+               CLASS(hbr, hbr)(&q.key);
+               auto hb = hbr.hb;
 
                futex_q_lock(&q, hb);
 
@@ -1009,7 +1010,7 @@ retry_private:
                 * the thread, performing resize, will block on hb->lock during
                 * the requeue.
                 */
-               futex_hash_put(no_free_ptr(hb));
+               futex_private_hash_put(no_free_ptr(hbr.fph));
                /*
                 * Must be done before we enqueue the waiter, here is unfortunately
                 * under the hb lock, but that *should* work because it does nothing.
@@ -1100,11 +1101,9 @@ no_block:
                __release(&hb->lock);
                futex_unqueue_pi(&q);
                spin_unlock(q.lock_ptr);
-               if (q.drop_hb_ref) {
-                       CLASS(hb, hb)(&q.key);
-                       /* Additional reference from futex_unlock_pi() */
-                       futex_hash_put(hb);
-               }
+
+               /* Additional reference from futex_unlock_pi() */
+               futex_private_hash_put(q.drop_fph);
                goto out;
 
 out_unlock_put_key:
@@ -1161,7 +1160,8 @@ retry:
        if (ret)
                return ret;
 
-       CLASS(hb, hb)(&key);
+       CLASS(hbr, hbr)(&key);
+       auto hb = hbr.hb;
        spin_lock(&hb->lock);
 retry_hb:
 
@@ -1218,8 +1218,9 @@ retry_hb:
                         * Acquire a reference for the leaving waiter to ensure
                         * valid futex_q::lock_ptr.
                         */
-                       futex_hash_get(hb);
-                       top_waiter->drop_hb_ref = true;
+                       if (futex_key_is_private(&key))
+                               top_waiter->drop_fph = futex_private_hash(key.private.mm);
+
                        __futex_unqueue(top_waiter);
                        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                        goto retry_hb;
index b597cb3d17fc110014e56c20ed268e220e323886..79823ad136830a871b7fca255ebbf06d2f60c806 100644 (file)
@@ -241,8 +241,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
         * Acquire a reference for the waiter to ensure valid
         * futex_q::lock_ptr.
         */
-       futex_hash_get(hb);
-       q->drop_hb_ref = true;
+       if (futex_key_is_private(key))
+               q->drop_fph = futex_private_hash(key->private.mm);
        q->lock_ptr = &hb->lock;
        task = READ_ONCE(q->task);
 
@@ -459,8 +459,10 @@ retry:
 
 retry_private:
        if (1) {
-               CLASS(hb, hb1)(&key1);
-               CLASS(hb, hb2)(&key2);
+               CLASS(hbr, hbr1)(&key1);
+               CLASS(hbr, hbr2)(&key2);
+               auto hb1 = hbr1.hb;
+               auto hb2 = hbr2.hb;
 
                futex_hb_waiters_inc(hb2);
                double_lock_hb(hb1, hb2);
@@ -832,7 +834,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        switch (futex_requeue_pi_wakeup_sync(&q)) {
        case Q_REQUEUE_PI_IGNORE:
                {
-                       CLASS(hb, hb)(&q.key);
+                       CLASS(hbr, hbr)(&q.key);
+                       auto hb = hbr.hb;
                        /* The waiter is still on uaddr1 */
                        spin_lock(&hb->lock);
                        ret = handle_early_requeue_pi_wakeup(hb, &q, to);
@@ -902,11 +905,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        default:
                BUG();
        }
-       if (q.drop_hb_ref) {
-               CLASS(hb, hb)(&q.key);
-               /* Additional reference from requeue_pi_wake_futex() */
-               futex_hash_put(hb);
-       }
+       /* Additional reference from requeue_pi_wake_futex() */
+       futex_private_hash_put(q.drop_fph);
 
 out:
        if (to) {
index 8f5e5d3023568f2435b8f40fdeb04d8e2177d196..d4483d15d30a5d3fa2d4968338a3542313a40cf7 100644 (file)
@@ -195,7 +195,8 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop, int nr_w
        if ((flags & FLAGS_STRICT) && !nr_wake)
                return 0;
 
-       CLASS(hb, hb)(&key);
+       CLASS(hbr, hbr)(&key);
+       auto hb = hbr.hb;
 
        /* Make sure we really have tasks to wakeup */
        if (!futex_hb_waiters_pending(hb))
@@ -292,8 +293,10 @@ retry:
 
 retry_private:
        if (1) {
-               CLASS(hb, hb1)(&key1);
-               CLASS(hb, hb2)(&key2);
+               CLASS(hbr, hbr1)(&key1);
+               CLASS(hbr, hbr2)(&key2);
+               auto hb1 = hbr1.hb;
+               auto hb2 = hbr2.hb;
 
                double_lock_hb(hb1, hb2);
                op_ret = futex_atomic_op_inuser(op, uaddr2);
@@ -435,7 +438,7 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
         * Make sure to have a reference on the private_hash such that we
         * don't block on rehash after changing the task state below.
         */
-       guard(private_hash)();
+       guard(private_hash)(current->mm);
 
        /*
         * Enqueuing multiple futexes is tricky, because we need to enqueue
@@ -472,7 +475,8 @@ retry:
                u32 val = vs[i].w.val;
 
                if (1) {
-                       CLASS(hb, hb)(&q->key);
+                       CLASS(hbr, hbr)(&q->key);
+                       auto hb = hbr.hb;
 
                        futex_q_lock(q, hb);
                        ret = futex_get_value_locked(&uval, uaddr);
@@ -647,7 +651,8 @@ retry:
 
 retry_private:
        if (1) {
-               CLASS(hb, hb)(&q->key);
+               CLASS(hbr, hbr)(&q->key);
+               auto hb = hbr.hb;
 
                futex_q_lock(q, hb);