x86: Cleanup pthread_spin_{try}lock.S

author Noah Goldstein <goldstein.w.n@gmail.com>

Sat, 1 Oct 2022 04:13:27 +0000 (21:13 -0700)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Mon, 3 Oct 2022 21:13:49 +0000 (14:13 -0700)
author Noah Goldstein <goldstein.w.n@gmail.com>
Sat, 1 Oct 2022 04:13:27 +0000 (21:13 -0700)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Mon, 3 Oct 2022 21:13:49 +0000 (14:13 -0700)
diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S

index 44b837d9dbd822e1f3d55ea785a2ed47d23e5662..1e09e59b10d42c3a8de2af5e9619bdafdedb0f1e 100644 (file)
--- a/sysdeps/x86_64/nptl/pthread_spin_lock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S
@@ -19,18 +19,27 @@
  #include <shlib-compat.h>
  
  ENTRY(__pthread_spin_lock)
-1:     LOCK
-       decl    0(%rdi)
-       jne     2f
+       /* Always return zero; %eax is also the zero operand of the cmpl below.  */
         xor     %eax, %eax
+       LOCK
+       decl    0(%rdi)
+       jne     1f
         ret
  
         .align  16
-2:     rep
+1:
+       /* `rep nop` == `pause`.  Spin while the lock word is <= 0.  */
+       rep
         nop
-       cmpl    $0, 0(%rdi)
-       jg      1b
-       jmp     2b
+       cmpl    %eax, 0(%rdi)
+       jle     1b
+       /* Just repeat the `lock decl` logic here.  The code size saved
+          by jumping back to the entry would not change how many 16-byte
+          chunks (default function alignment) the code fits in.  */
+       LOCK
+       decl    0(%rdi)
+       jne     1b
+       ret
  END(__pthread_spin_lock)
  versioned_symbol (libc, __pthread_spin_lock, pthread_spin_lock, GLIBC_2_34)
  
diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S

index fffdb27dd9ed54cae988bb735e1fa6b1c0f444e4..a1f97cb420af579434d1f1e02605163c0799f8c8 100644 (file)
--- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
@@ -20,13 +20,21 @@
  #include <shlib-compat.h>
  
  ENTRY(__pthread_spin_trylock)
-       movl    $1, %eax
         xorl    %ecx, %ecx
-       lock
-       cmpxchgl %ecx, (%rdi)
+       /* xchg with a memory operand has an implicit LOCK prefix.  */
+       xchgl   %ecx, (%rdi)
+
+       /* Branch on result.  Expectation is the use of trylock will be
+          branching on success/failure so this branch can be used to
+          predict the coming branch.  It also has the benefit of
+          breaking the likely expensive memory dependency on (%rdi).  */
+       cmpl    $1, %ecx
+       jnz     1f
+       xorl    %eax, %eax
+       ret
+1:
         movl    $EBUSY, %eax
-       cmovel  %ecx, %eax
-       retq
+       ret
  END(__pthread_spin_trylock)
  versioned_symbol (libc, __pthread_spin_trylock, pthread_spin_trylock,
                   GLIBC_2_34)
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Sat, 1 Oct 2022 04:13:27 +0000 (21:13 -0700)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Mon, 3 Oct 2022 21:13:49 +0000 (14:13 -0700)
sysdeps/x86_64/nptl/pthread_spin_lock.S		patch \| blob \| blame \| history
sysdeps/x86_64/nptl/pthread_spin_trylock.S		patch \| blob \| blame \| history