#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
-#include <linux/rcuref.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
#define futex_queues (__futex_data.queues)
struct futex_private_hash {
- rcuref_t users;
+ int state;
unsigned int hash_mask;
struct rcu_head rcu;
void *mm;
__futex_hash(union futex_key *key, struct futex_private_hash *fph);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
static inline bool futex_key_is_private(union futex_key *key)
{
/*
{
if (fph->immutable)
return true;
- return rcuref_get(&fph->users);
+ return futex_ref_get(fph);
}
void futex_private_hash_put(struct futex_private_hash *fph)
{
- /* Ignore return value, last put is verified via rcuref_is_dead() */
if (fph->immutable)
return;
- if (rcuref_put(&fph->users))
+ if (futex_ref_put(fph))
wake_up_var(fph->mm);
}
fph = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
if (fph) {
- if (!rcuref_is_dead(&fph->users)) {
+ if (!futex_ref_is_dead(fph)) {
mm->futex_phash_new = new;
return false;
}
futex_rehash_private(fph, new);
}
- rcu_assign_pointer(mm->futex_phash, new);
+ new->state = FR_PERCPU;
+ scoped_guard(rcu) {
+ mm->futex_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex_phash, new);
+ }
kvfree_rcu(fph, rcu);
return true;
}
if (!fph)
return NULL;
- if (fph->immutable)
- return fph;
- if (rcuref_get(&fph->users))
+ if (futex_private_hash_get(fph))
return fph;
}
futex_pivot_hash(mm);
#define FH_IMMUTABLE 0x02
#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * The counter we're about to switch to must have fully switched;
+ * otherwise it would be impossible for it to have reported success
+ * from futex_ref_is_dead().
+ */
+ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+ /*
+ * Set the atomic to the bias value such that futex_ref_{get,put}()
+ * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+ * when folding in the percpu count.
+ */
+ atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ smp_store_release(&fph->state, FR_ATOMIC);
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+ unsigned int count = 0;
+ long ret;
+ int cpu;
+
+ /*
+ * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+ * and per this RCU callback, everybody must now observe this state and
+ * use the atomic variable.
+ */
+ WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+ /*
+ * Therefore the per-cpu counter is now stable, sum and reset.
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ count += *ptr;
+ *ptr = 0;
+ }
+
+ /*
+ * Re-init for the next cycle.
+ */
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+ /*
+ * Add actual count, subtract bias and initial refcount.
+ *
+ * The moment this atomic operation happens, futex_ref_is_dead() can
+ * become true.
+ */
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ if (!ret)
+ wake_up_var(mm);
+
+ WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+ if (fph->state == FR_PERCPU) {
+ /*
+ * Per this extra grace-period, everybody must now observe
+ * fph as the current fph and no previously observed fph's
+ * are in-flight.
+ *
+ * Notably, nobody will now rely on the atomic
+ * futex_ref_is_dead() state anymore so we can begin the
+ * migration of the per-cpu counter into the atomic.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * Can only transition the current fph;
+ */
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
+
+ /*
+ * In order to avoid the following scenario:
+ *
+ * futex_hash() __futex_pivot_hash()
+ * guard(rcu); guard(mm->futex_hash_lock);
+ * fph = mm->futex_phash;
+ * rcu_assign_pointer(&mm->futex_phash, new);
+ * futex_hash_allocate()
+ * futex_ref_drop()
+ * fph->state = FR_ATOMIC;
+ * atomic_set(, BIAS);
+ *
+ * futex_private_hash_get(fph); // OOPS
+ *
+ * Where an old fph (which is FR_ATOMIC) and should fail on
+ * inc_not_zero, will succeed because a new transition is started and
+ * the atomic is bias'ed away from 0.
+ *
+ * There must be at least one full grace-period between publishing a
+ * new fph and trying to replace it.
+ */
+ if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ /*
+ * There was a grace-period, we can begin now.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+ this_cpu_inc(*mm->futex_ref);
+ return true;
+ }
+
+ return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+ this_cpu_dec(*mm->futex_ref);
+ return false;
+ }
+
+ return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU)
+ return false;
+
+ return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+ mutex_init(&mm->futex_hash_lock);
+ RCU_INIT_POINTER(mm->futex_phash, NULL);
+ mm->futex_phash_new = NULL;
+ /* futex-ref */
+ atomic_long_set(&mm->futex_atomic, 0);
+ mm->futex_batches = get_state_synchronize_rcu();
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ return 0;
+}
+
void futex_hash_free(struct mm_struct *mm)
{
struct futex_private_hash *fph;
+ free_percpu(mm->futex_ref);
kvfree(mm->futex_phash_new);
fph = rcu_dereference_raw(mm->futex_phash);
- if (fph) {
- WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+ if (fph)
kvfree(fph);
- }
}
static bool futex_pivot_pending(struct mm_struct *mm)
return true;
fph = rcu_dereference(mm->futex_phash);
- return rcuref_is_dead(&fph->users);
+ return futex_ref_is_dead(fph);
}
static bool futex_hash_less(struct futex_private_hash *a,
}
}
- fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ fph = kvzalloc(struct_size(fph, queues, hash_slots),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
return -ENOMEM;
- rcuref_init(&fph->users, 1);
fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
fph->custom = custom;
fph->immutable = !!(flags & FH_IMMUTABLE);
* allocated a replacement hash, drop the initial
* reference on the existing hash.
*/
- futex_private_hash_put(cur);
+ futex_ref_drop(cur);
}
if (new) {