]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
x86/vdso: Implement __vdso_futex_robust_try_unlock()
author Thomas Gleixner <tglx@kernel.org>
Tue, 2 Jun 2026 09:10:12 +0000 (11:10 +0200)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 3 Jun 2026 09:38:52 +0000 (11:38 +0200)
When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
then the unlock sequence in userspace looks like this:

  1) robust_list_set_op_pending(mutex);
  2) robust_list_remove(mutex);

   lval = gettid();
  3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
  4) robust_list_clear_op_pending();
   else
  5) sys_futex(OP,...FUTEX_ROBUST_UNLOCK);

That still leaves a minimal race window between #3 and #4 where the mutex
could be acquired by some other task which observes that it is the last
user and:

  1) unmaps the mutex memory
  2) maps a different file, which ends up covering the same address

If the original task then exits before completing #4, the kernel robust
list handling observes the pending op entry and tries to fix up user space.

In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupt unrelated data.

Provide a VDSO function which exposes the critical section window in the
VDSO symbol table. The resulting addresses are updated in the task's mm
when the VDSO is (re)map()'ed.

The core code detects when a task was interrupted within the critical
section and is about to deliver a signal. It then invokes an architecture
specific function which determines whether the pending op pointer has to be
cleared or not. The unlock assembly sequence on 64-bit is:

        mov             %esi,%eax       // Load TID into EAX
        xor             %ecx,%ecx       // Set ECX to 0
        lock cmpxchg    %ecx,(%rdi)     // Try the TID -> 0 transition
  .Lstart:
        jnz             .Lend
        movq            %rcx,(%rdx)     // Clear list_op_pending
  .Lend:
        ret

So the decision can be simply based on the ZF state in regs->flags. The
pending op pointer is always in DX independent of the build mode
(32/64-bit) to make the pending op pointer retrieval uniform. The size of
the pointer is stored in the matching critical section range struct and
the core code retrieves it from there. So the pointer retrieval function
does not have to care. It is bit-size independent:

     return regs->flags & X86_EFLAGS_ZF ? regs->dx : NULL;

There are two entry points to handle the different robust list pending op
pointer size:

__vdso_futex_robust_list64_try_unlock()
__vdso_futex_robust_list32_try_unlock()

The 32-bit VDSO provides only __vdso_futex_robust_list32_try_unlock().

The 64-bit VDSO always provides __vdso_futex_robust_list64_try_unlock() and
when COMPAT is enabled also the list32 variant, which is required to
support multi-size robust list pointers used by gaming emulators.

The unlock function is inspired by an idea from Mathieu Desnoyers.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Acked-by: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/20260311185409.1988269-1-mathieu.desnoyers@efficios.com
Link: https://patch.msgid.link/20260602090535.883796247@kernel.org
arch/x86/Kconfig
arch/x86/entry/vdso/common/vfutex.c [new file with mode: 0644]
arch/x86/entry/vdso/vdso32/Makefile
arch/x86/entry/vdso/vdso32/vdso32.lds.S
arch/x86/entry/vdso/vdso32/vfutex.c [new file with mode: 0644]
arch/x86/entry/vdso/vdso64/Makefile
arch/x86/entry/vdso/vdso64/vdso64.lds.S
arch/x86/entry/vdso/vdso64/vdsox32.lds.S
arch/x86/entry/vdso/vdso64/vfutex.c [new file with mode: 0644]
arch/x86/include/asm/futex_robust.h [new file with mode: 0644]

index 1ce62a996192e4688f5be7eac7572b696ca4d91a..fdaef60b46d657c00efcc01d35ea0e8914c2f05a 100644 (file)
@@ -239,6 +239,7 @@ config X86
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select HAVE_EISA                        if X86_32
        select HAVE_EXIT_THREAD
+       select HAVE_FUTEX_ROBUST_UNLOCK
        select HAVE_GENERIC_TIF_BITS
        select HAVE_GUP_FAST
        select HAVE_FENTRY                      if X86_64 || DYNAMIC_FTRACE
diff --git a/arch/x86/entry/vdso/common/vfutex.c b/arch/x86/entry/vdso/common/vfutex.c
new file mode 100644 (file)
index 0000000..454f059
--- /dev/null
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <vdso/futex.h>
+
+/*
+ * Assembly template for the try unlock functions. The basic functionality is:
+ *
+ *             mov             %esi, %eax      Move the TID into EAX
+ *             xor             %ecx, %ecx      Clear ECX
+ *             lock cmpxchgl   %ecx, (%rdi)    Attempt the TID -> 0 transition
+ * .Lcs_start:                                 Start of the critical section
+ *             jnz             .Lcs_end        If cmpxchgl failed jump to the end
+ * .Lcs_success:                               Start of the success section
+ *             movq            %rcx, (%rdx)    Set the pending op pointer to 0
+ * .Lcs_end:                                   End of the critical section
+ *
+ * .Lcs_start and .Lcs_end establish the critical section range. .Lcs_success is
+ * technically not required, but there for illustration, debugging and testing.
+ *
+ * When CONFIG_COMPAT is enabled then the 64-bit VDSO provides two functions.
+ * One for the regular 64-bit sized pending operation pointer and one for a
+ * 32-bit sized pointer to support gaming emulators.
+ *
+ * The 32-bit VDSO provides only the one for 32-bit sized pointers.
+ */
+#define __stringify_1(x...)    #x                      /* turn the tokens into a string literal as written */
+#define __stringify(x...)      __stringify_1(x)        /* macro-expand x first, then stringify */
+
+#define LABEL(prefix, which)   __stringify(prefix##_try_unlock_cs_##which:)    /* asm label <prefix>_try_unlock_cs_<which>: */
+
+#define JNZ_END(prefix)                "jnz " __stringify(prefix) "_try_unlock_cs_end\n"       /* ZF clear: jump to the end label */
+
+#define CLEAR_POPQ             "movq   %[zero],  %a[pop]\n"    /* 64-bit store of zero through the pop address */
+#define CLEAR_POPL             "movl   %k[zero], %a[pop]\n"    /* 32-bit store of zero through the pop address */
+
+#define futex_robust_try_unlock(prefix, clear_pop, __lock, __tid, __pop)\
+({                                                                     \
+       asm volatile (                                                  \
+               "                                               \n"     \
+               "       lock cmpxchgl   %k[zero], %a[lock]      \n"     /* if *lock == EAX then *lock = 0 and ZF = 1, else EAX = *lock and ZF = 0 */ \
+               "                                               \n"     \
+               LABEL(prefix, start)            /* critical section starts right after the cmpxchg */ \
+               "                                               \n"     \
+               JNZ_END(prefix)                 /* cmpxchg failed: skip the store below */ \
+               "                                               \n"     \
+               LABEL(prefix, success)          /* reached only when the cmpxchg succeeded */ \
+               "                                               \n"     \
+                       clear_pop               /* CLEAR_POPQ or CLEAR_POPL: zero the pending op pointer */ \
+               "                                               \n"     \
+               LABEL(prefix, end)              /* critical section ends */ \
+               : [tid]   "+&a" (__tid)         /* EAX: value to compare on input, cmpxchg result on output */ \
+               : [lock]  "D"   (__lock),       /* DI: address of the lock word */ \
+                 [pop]   "d"   (__pop),        /* DX: address of the pending op pointer */ \
+                 [zero]  "r"   (0UL)           /* any general purpose register, holding 0 */ \
+               : "memory"                      /* the asm writes to *lock and *pop */ \
+       );                                                              \
+       __tid;                                  /* value of the macro: EAX after the cmpxchg */ \
+})
+
+#ifdef __x86_64__      /* only compiled when targeting 64-bit */
+__u32 __vdso_futex_robust_list64_try_unlock(__u32 *lock, __u32 tid, __u64 *pop)
+{      /* Try the tid -> 0 transition on *lock; on success zero the 64-bit wide *pop. */
+       return futex_robust_try_unlock(__futex_list64, CLEAR_POPQ, lock, tid, pop);     /* emits labels __futex_list64_try_unlock_cs_{start,success,end} */
+}
+#endif /* __x86_64__ */
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+__u32 __vdso_futex_robust_list32_try_unlock(__u32 *lock, __u32 tid, __u32 *pop)
+{      /* Try the tid -> 0 transition on *lock; on success zero the 32-bit wide *pop. */
+       return futex_robust_try_unlock(__futex_list32, CLEAR_POPL, lock, tid, pop);     /* emits labels __futex_list32_try_unlock_cs_{start,success,end} */
+}
+#endif /* CONFIG_X86_32 || CONFIG_COMPAT */
index ded4fc6a48cdcf94ba05e6b8fdf0884c117e8843..ab4b1f635f66048448e67bff561250e42dc7c510 100644 (file)
@@ -7,8 +7,9 @@
 vdsos-y                        := 32
 
 # Files to link into the vDSO:
-vobjs-y                        := note.o vclock_gettime.o vgetcpu.o
-vobjs-y                        += system_call.o sigreturn.o
+vobjs-y                                        := note.o vclock_gettime.o vgetcpu.o
+vobjs-y                                        += system_call.o sigreturn.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK)    += vfutex.o
 
 # Compilation flags
 flags-y                        := -DBUILD_VDSO32 -m32 -mregparm=0
index 55554f80d930e0c320d6ebebb36a707785fe333b..cee8f7f9fe80d42a6a71abb4cbff799bab9549f7 100644 (file)
@@ -30,6 +30,9 @@ VERSION
                __vdso_clock_gettime64;
                __vdso_clock_getres_time64;
                __vdso_getcpu;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+               __vdso_futex_robust_list32_try_unlock;
+#endif
        };
 
        LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdso32/vfutex.c b/arch/x86/entry/vdso/vdso32/vfutex.c
new file mode 100644 (file)
index 0000000..940a6ee
--- /dev/null
@@ -0,0 +1 @@
+#include "common/vfutex.c"
index bfffaf1aeecce774825d7f760c1e1ac46ace31de..7c0790065b5e766379ac5a0c2eaca78e8d92d4a7 100644 (file)
@@ -8,9 +8,10 @@ vdsos-y                                := 64
 vdsos-$(CONFIG_X86_X32_ABI)    += x32
 
 # Files to link into the vDSO:
-vobjs-y                                := note.o vclock_gettime.o vgetcpu.o
-vobjs-y                                += vgetrandom.o vgetrandom-chacha.o
-vobjs-$(CONFIG_X86_SGX)                += vsgx.o
+vobjs-y                                        := note.o vclock_gettime.o vgetcpu.o
+vobjs-y                                        += vgetrandom.o vgetrandom-chacha.o
+vobjs-$(CONFIG_X86_SGX)                        += vsgx.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK)    += vfutex.o
 
 # Compilation flags
 flags-y                                := -DBUILD_VDSO64 -m64 -mcmodel=small
index 5ce3f2b6373a23c3df5fc8689391a708cc808f6b..4a72122da81b575812ebc9b6ec18488a91216800 100644 (file)
@@ -32,6 +32,13 @@ VERSION {
 #endif
                getrandom;
                __vdso_getrandom;
+
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+               __vdso_futex_robust_list64_try_unlock;
+#ifdef CONFIG_COMPAT
+               __vdso_futex_robust_list32_try_unlock;
+#endif
+#endif
        local: *;
        };
 }
index 3dbd20c8dacc661b9787796144d52944591dc37d..b917dc69f62f8ce716df7fb41d851ff7bcbe038f 100644 (file)
@@ -22,6 +22,13 @@ VERSION {
                __vdso_getcpu;
                __vdso_time;
                __vdso_clock_getres;
+
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+               __vdso_futex_robust_list64_try_unlock;
+#ifdef CONFIG_COMPAT
+               __vdso_futex_robust_list32_try_unlock;
+#endif
+#endif
        local: *;
        };
 }
diff --git a/arch/x86/entry/vdso/vdso64/vfutex.c b/arch/x86/entry/vdso/vdso64/vfutex.c
new file mode 100644 (file)
index 0000000..940a6ee
--- /dev/null
@@ -0,0 +1 @@
+#include "common/vfutex.c"
diff --git a/arch/x86/include/asm/futex_robust.h b/arch/x86/include/asm/futex_robust.h
new file mode 100644 (file)
index 0000000..e879547
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FUTEX_ROBUST_H
+#define _ASM_X86_FUTEX_ROBUST_H
+
+#include <asm/ptrace.h>
+
+static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
+{
+       /*
+        * ZF set in the saved flags: the cmpxchg succeeded, so hand back the
+        * pending op pointer from DX for clearing. ZF clear: return NULL.
+        */
+       return regs->flags & X86_EFLAGS_ZF ? (void __user *)regs->dx : NULL;
+}
+
+#define arch_futex_robust_unlock_get_pop(regs) \
+       x86_futex_robust_unlock_get_pop(regs)   /* maps the arch_ name onto the x86 helper */
+
+#endif /* _ASM_X86_FUTEX_ROBUST_H */