x86/fpu: Cleanup code in svml_{s|d}_wrapper_impl.h

author Noah Goldstein <goldstein.w.n@gmail.com>

Sat, 19 Nov 2022 00:13:31 +0000 (16:13 -0800)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Mon, 28 Nov 2022 04:22:49 +0000 (20:22 -0800)
author Noah Goldstein <goldstein.w.n@gmail.com>
Sat, 19 Nov 2022 00:13:31 +0000 (16:13 -0800)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Mon, 28 Nov 2022 04:22:49 +0000 (20:22 -0800)
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h

index b03a2122b91bfe23c1af075fbf224eeb6ee19997..9900f85a5568896385c20d713574a09a7fffe5b6 100644 (file)
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -18,39 +18,38 @@
  
  /* SSE2 ISA version as wrapper to scalar.  */
  .macro WRAPPER_IMPL_SSE2 callee
-       subq    $40, %rsp
-       cfi_adjust_cfa_offset (40)
+       subq    $24, %rsp               /* 16-byte spill slot for xmm0 plus 8 bytes of padding.  */
+       cfi_adjust_cfa_offset (24)
         movaps  %xmm0, (%rsp)
         call    JUMPTARGET(\callee)
-       movsd   %xmm0, 16(%rsp)
+       movsd   %xmm0, (%rsp)           /* first result overwrites the already-consumed low input lane.  */
         movsd   8(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movsd   16(%rsp), %xmm1
-       movsd   %xmm0, 24(%rsp)
+       movsd   (%rsp), %xmm1           /* xmm1 = first result; xmm0 still holds the second.  */
         unpcklpd %xmm0, %xmm1
         movaps  %xmm1, %xmm0
-       addq    $40, %rsp
-       cfi_adjust_cfa_offset (-40)
+       addq    $24, %rsp               /* release the frame; must mirror the subq above.  */
+       cfi_adjust_cfa_offset (-24)
         ret
  .endm
  
+
  /* 2 argument SSE2 ISA version as wrapper to scalar.  */
  .macro WRAPPER_IMPL_SSE2_ff callee
-       subq    $56, %rsp
-       cfi_adjust_cfa_offset (56)
+       subq    $40, %rsp               /* two 16-byte spill slots (xmm0, xmm1) plus 8 bytes of padding.  */
+       cfi_adjust_cfa_offset (40)
         movaps  %xmm0, (%rsp)
         movaps  %xmm1, 16(%rsp)
         call    JUMPTARGET(\callee)
-       movsd   %xmm0, 32(%rsp)
+       movsd   %xmm0, (%rsp)           /* first result overwrites the already-consumed low lane of the saved xmm0.  */
         movsd   8(%rsp), %xmm0
         movsd   24(%rsp), %xmm1
         call    JUMPTARGET(\callee)
-       movsd   32(%rsp), %xmm1
-       movsd   %xmm0, 40(%rsp)
+       movsd   (%rsp), %xmm1           /* xmm1 = first result; xmm0 still holds the second.  */
         unpcklpd %xmm0, %xmm1
         movaps  %xmm1, %xmm0
-       addq    $56, %rsp
-       cfi_adjust_cfa_offset (-56)
+       addq    $40, %rsp               /* release the frame; must mirror the subq above.  */
+       cfi_adjust_cfa_offset (-40)
         ret
  .endm
  
@@ -62,30 +61,18 @@
         pushq   %rbx
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbx, 0)
+       subq    $24, %rsp
+       cfi_adjust_cfa_offset (24)
+       movaps  %xmm0, (%rsp)
         movq    %rdi, %rbp
         movq    %rsi, %rbx
-       subq    $40, %rsp
-       cfi_adjust_cfa_offset (40)
-       leaq    16(%rsp), %rsi
-       leaq    24(%rsp), %rdi
-       movaps  %xmm0, (%rsp)
         call    JUMPTARGET(\callee)
-       leaq    16(%rsp), %rsi
-       leaq    24(%rsp), %rdi
-       movsd   24(%rsp), %xmm0
-       movapd  (%rsp), %xmm1
-       movsd   %xmm0, 0(%rbp)
-       unpckhpd %xmm1, %xmm1
-       movsd   16(%rsp), %xmm0
-       movsd   %xmm0, (%rbx)
-       movapd  %xmm1, %xmm0
+       movsd   8(%rsp), %xmm0
+       leaq    8(%rbp), %rdi
+       leaq    8(%rbx), %rsi
         call    JUMPTARGET(\callee)
-       movsd   24(%rsp), %xmm0
-       movsd   %xmm0, 8(%rbp)
-       movsd   16(%rsp), %xmm0
-       movsd   %xmm0, 8(%rbx)
-       addq    $40, %rsp
-       cfi_adjust_cfa_offset (-40)
+       addq    $24, %rsp
+       cfi_adjust_cfa_offset (-24)
         popq    %rbx
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbx)
@@ -104,15 +91,17 @@
         cfi_def_cfa_register (%rbp)
         andq    $-32, %rsp
         subq    $32, %rsp
-       vextractf128 $1, %ymm0, (%rsp)
+       vmovaps %ymm0, (%rsp)
         vzeroupper
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovapd %xmm0, 16(%rsp)
-       vmovaps (%rsp), %xmm0
+       vmovaps %xmm0, (%rsp)
+       vmovaps 16(%rsp), %xmm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovapd %xmm0, %xmm1
-       vmovapd 16(%rsp), %xmm0
-       vinsertf128 $1, %xmm1, %ymm0, %ymm0
+       /* combine xmm0 (return of second call) with result of first
+          call (saved on stack). Might be worth exploring logic that
+          uses `vpblend` and reads in ymm1 using -16(rsp).  */
+       vmovaps (%rsp), %xmm1
+       vinsertf128 $1, %xmm0, %ymm1, %ymm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -130,17 +119,19 @@
         cfi_def_cfa_register (%rbp)
         andq    $-32, %rsp
         subq    $64, %rsp
-       vextractf128 $1, %ymm0, 16(%rsp)
-       vextractf128 $1, %ymm1, (%rsp)
+       vmovaps %ymm0, (%rsp)
+       vmovaps %ymm1, 32(%rsp)
         vzeroupper
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps %xmm0, 32(%rsp)
+       vmovaps 48(%rsp), %xmm1
+       vmovaps %xmm0, (%rsp)
         vmovaps 16(%rsp), %xmm0
-       vmovaps (%rsp), %xmm1
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps %xmm0, %xmm1
-       vmovaps 32(%rsp), %xmm0
-       vinsertf128 $1, %xmm1, %ymm0, %ymm0
+       /* combine xmm0 (return of second call) with result of first
+          call (saved on stack). Might be worth exploring logic that
+          uses `vpblend` and reads in ymm1 using -16(rsp).  */
+       vmovaps (%rsp), %xmm1
+       vinsertf128 $1, %xmm0, %ymm1, %ymm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -155,35 +146,21 @@
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq    %rsp, %rbp
-       cfi_def_cfa_register (%rbp)
         andq    $-32, %rsp
-       pushq   %r13
-       cfi_adjust_cfa_offset (8)
-       cfi_rel_offset (%r13, 0)
+       subq    $32, %rsp
+       vmovaps %ymm0, (%rsp)
+       pushq   %rbx
         pushq   %r14
-       cfi_adjust_cfa_offset (8)
-       cfi_rel_offset (%r14, 0)
-       subq    $48, %rsp
+       movq    %rdi, %rbx
         movq    %rsi, %r14
-       movq    %rdi, %r13
-       vextractf128 $1, %ymm0, 32(%rsp)
         vzeroupper
         call    HIDDEN_JUMPTARGET(\callee)
         vmovaps 32(%rsp), %xmm0
-       lea     (%rsp), %rdi
-       lea     16(%rsp), %rsi
+       leaq    16(%rbx), %rdi
+       leaq    16(%r14), %rsi
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovapd (%rsp), %xmm0
-       vmovapd 16(%rsp), %xmm1
-       vmovapd %xmm0, 16(%r13)
-       vmovapd %xmm1, 16(%r14)
-       addq    $48, %rsp
         popq    %r14
-       cfi_adjust_cfa_offset (-8)
-       cfi_restore (%r14)
-       popq    %r13
-       cfi_adjust_cfa_offset (-8)
-       cfi_restore (%r13)
+       popq    %rbx
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -200,15 +177,16 @@
         movq    %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq    $-64, %rsp
-       subq    $128, %rsp
+       subq    $64, %rsp
         vmovups %zmm0, (%rsp)
-       vmovupd (%rsp), %ymm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd %ymm0, 64(%rsp)
+       vmovupd %ymm0, (%rsp)
         vmovupd 32(%rsp), %ymm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd %ymm0, 96(%rsp)
-       vmovups 64(%rsp), %zmm0
+       /* combine ymm0 (return of second call) with result of first
+          call (saved on stack).  */
+       vmovaps (%rsp), %ymm1
+       vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -225,18 +203,19 @@
         movq    %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq    $-64, %rsp
-       subq    $192, %rsp
+       addq    $-128, %rsp
         vmovups %zmm0, (%rsp)
         vmovups %zmm1, 64(%rsp)
-       vmovupd (%rsp), %ymm0
-       vmovupd 64(%rsp), %ymm1
+       /* ymm0 and ymm1 are already set.  */
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd %ymm0, 128(%rsp)
-       vmovupd 32(%rsp), %ymm0
-       vmovupd 96(%rsp), %ymm1
+       vmovups 96(%rsp), %ymm1
+       vmovaps %ymm0, (%rsp)
+       vmovups 32(%rsp), %ymm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd %ymm0, 160(%rsp)
-       vmovups 128(%rsp), %zmm0
+       /* combine ymm0 (return of second call) with result of first
+          call (saved on stack).  */
+       vmovaps (%rsp), %ymm1
+       vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -253,34 +232,20 @@
         movq    %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq    $-64, %rsp
-       pushq   %r12
-       cfi_adjust_cfa_offset (8)
-       cfi_rel_offset (%r12, 0)
-       pushq   %r13
-       cfi_adjust_cfa_offset (8)
-       cfi_rel_offset (%r13, 0)
-       subq    $176, %rsp
-       movq    %rsi, %r13
-       vmovups %zmm0, (%rsp)
-       movq    %rdi, %r12
-       vmovupd (%rsp), %ymm0
+       subq    $64, %rsp
+       vmovaps %zmm0, (%rsp)
+       pushq   %rbx
+       pushq   %r14
+       movq    %rdi, %rbx
+       movq    %rsi, %r14
+       /* ymm0 is already set.  */
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd 32(%rsp), %ymm0
-       lea     64(%rsp), %rdi
-       lea     96(%rsp), %rsi
+       vmovaps 48(%rsp), %ymm0
+       leaq    32(%rbx), %rdi
+       leaq    32(%r14), %rsi
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd 64(%rsp), %ymm0
-       vmovupd 96(%rsp), %ymm1
-       vmovupd %ymm0, 32(%r12)
-       vmovupd %ymm1, 32(%r13)
-       vzeroupper
-       addq    $176, %rsp
-       popq    %r13
-       cfi_adjust_cfa_offset (-8)
-       cfi_restore (%r13)
-       popq    %r12
-       cfi_adjust_cfa_offset (-8)
-       cfi_restore (%r12)
+       popq    %r14
+       popq    %rbx
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h

index cecf6c8384dd75b351323db8281802da12beeef8..fd9b3630456b31ae0032fbca64e7d54426c9091b 100644 (file)
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -18,61 +18,66 @@
  
  /* SSE2 ISA version as wrapper to scalar.  */
  .macro WRAPPER_IMPL_SSE2 callee
-       subq    $40, %rsp
-       cfi_adjust_cfa_offset (40)
+       push    %rbx                    /* rbx carries the third result across the last call.  */
+       cfi_adjust_cfa_offset (8)
+       cfi_rel_offset (%rbx, 0)
+       subq    $16, %rsp               /* 16-byte spill slot for the four input lanes.  */
+       cfi_adjust_cfa_offset (16)
         movaps  %xmm0, (%rsp)
         call    JUMPTARGET(\callee)
-       movss   %xmm0, 16(%rsp)
+       movss   %xmm0, (%rsp)           /* result 0 overwrites the already-consumed input lane 0.  */
         movss   4(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movss   %xmm0, 20(%rsp)
+       movss   %xmm0, 4(%rsp)          /* result 1 overwrites the already-consumed input lane 1.  */
         movss   8(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movss   %xmm0, 24(%rsp)
+       movd    %xmm0, %ebx             /* result 2 is parked in ebx instead of on the stack.  */
         movss   12(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movss   16(%rsp), %xmm3
-       movss   20(%rsp), %xmm2
-       movss   24(%rsp), %xmm1
-       movss   %xmm0, 28(%rsp)
-       unpcklps %xmm1, %xmm3
-       unpcklps %xmm0, %xmm2
-       unpcklps %xmm2, %xmm3
-       movaps  %xmm3, %xmm0
-       addq    $40, %rsp
-       cfi_adjust_cfa_offset (-40)
+       movd    %ebx, %xmm1             /* xmm1 lane 0 = result 2.  */
+       unpcklps %xmm0, %xmm1           /* xmm1 low 64 bits = {result 2, result 3}.  */
+       movsd   (%rsp), %xmm0           /* xmm0 low 64 bits = {result 0, result 1} from the stack.  */
+       unpcklpd %xmm1, %xmm0           /* xmm0 = {result 0, result 1, result 2, result 3}.  */
+       addq    $16, %rsp
+       cfi_adjust_cfa_offset (-16)
+       popq    %rbx
+       cfi_adjust_cfa_offset (-8)
+       cfi_restore (%rbx)
         ret
  .endm
  
  /* 2 argument SSE2 ISA version as wrapper to scalar.  */
  .macro WRAPPER_IMPL_SSE2_ff callee
-       subq    $56, %rsp
-       cfi_adjust_cfa_offset (56)
+       push    %rbx
+       cfi_adjust_cfa_offset (8)
+       cfi_rel_offset (%rbx, 0)
+       subq    $32, %rsp
+       cfi_adjust_cfa_offset (40)
         movaps  %xmm0, (%rsp)
         movaps  %xmm1, 16(%rsp)
         call    JUMPTARGET(\callee)
-       movss   %xmm0, 32(%rsp)
-       movss   4(%rsp), %xmm0
         movss   20(%rsp), %xmm1
+       movss   %xmm0, 0(%rsp)
+       movss   4(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movss   %xmm0, 36(%rsp)
-       movss   8(%rsp), %xmm0
         movss   24(%rsp), %xmm1
+       movss   %xmm0, 4(%rsp)
+       movss   8(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movss   %xmm0, 40(%rsp)
-       movss   12(%rsp), %xmm0
         movss   28(%rsp), %xmm1
+       movd    %xmm0, %ebx
+       movss   12(%rsp), %xmm0
         call    JUMPTARGET(\callee)
-       movss   32(%rsp), %xmm3
-       movss   36(%rsp), %xmm2
-       movss   40(%rsp), %xmm1
-       movss   %xmm0, 44(%rsp)
-       unpcklps %xmm1, %xmm3
-       unpcklps %xmm0, %xmm2
-       unpcklps %xmm2, %xmm3
-       movaps  %xmm3, %xmm0
-       addq    $56, %rsp
-       cfi_adjust_cfa_offset (-56)
+       /* merge 4x results into xmm0.  */
+       movd    %ebx, %xmm1
+       unpcklps %xmm0, %xmm1
+       movsd   (%rsp), %xmm0
+       unpcklpd %xmm1, %xmm0
+       addq    $32, %rsp
+       cfi_adjust_cfa_offset (-32)
+       popq    %rbx
+       cfi_adjust_cfa_offset (-8)
+       cfi_restore (%rbx)
         ret
  .endm
  
@@ -86,48 +91,24 @@
         cfi_rel_offset (%rbx, 0)
         movq    %rdi, %rbp
         movq    %rsi, %rbx
-       subq    $40, %rsp
-       cfi_adjust_cfa_offset (40)
-       leaq    24(%rsp), %rsi
-       leaq    28(%rsp), %rdi
+       subq    $24, %rsp
+       cfi_adjust_cfa_offset (24)
         movaps  %xmm0, (%rsp)
         call    JUMPTARGET(\callee)
-       leaq    24(%rsp), %rsi
-       leaq    28(%rsp), %rdi
-       movss   28(%rsp), %xmm0
-       movss   %xmm0, 0(%rbp)
-       movaps  (%rsp), %xmm1
-       movss   24(%rsp), %xmm0
-       movss   %xmm0, (%rbx)
-       movaps  %xmm1, %xmm0
-       shufps  $85, %xmm1, %xmm0
+       movss   4(%rsp), %xmm0
+       leaq    4(%rbp), %rdi
+       leaq    4(%rbx), %rsi
         call    JUMPTARGET(\callee)
-       movss   28(%rsp), %xmm0
-       leaq    24(%rsp), %rsi
-       movss   %xmm0, 4(%rbp)
-       leaq    28(%rsp), %rdi
-       movaps  (%rsp), %xmm1
-       movss   24(%rsp), %xmm0
-       movss   %xmm0, 4(%rbx)
-       movaps  %xmm1, %xmm0
-       unpckhps %xmm1, %xmm0
+       movss   8(%rsp), %xmm0
+       leaq    8(%rbp), %rdi
+       leaq    8(%rbx), %rsi
         call    JUMPTARGET(\callee)
-       movaps  (%rsp), %xmm1
-       leaq    24(%rsp), %rsi
-       leaq    28(%rsp), %rdi
-       movss   28(%rsp), %xmm0
-       shufps  $255, %xmm1, %xmm1
-       movss   %xmm0, 8(%rbp)
-       movss   24(%rsp), %xmm0
-       movss   %xmm0, 8(%rbx)
-       movaps  %xmm1, %xmm0
+       movss   12(%rsp), %xmm0
+       leaq    12(%rbp), %rdi
+       leaq    12(%rbx), %rsi
         call    JUMPTARGET(\callee)
-       movss   28(%rsp), %xmm0
-       movss   %xmm0, 12(%rbp)
-       movss   24(%rsp), %xmm0
-       movss   %xmm0, 12(%rbx)
-       addq    $40, %rsp
-       cfi_adjust_cfa_offset (-40)
+       addq    $24, %rsp
+       cfi_adjust_cfa_offset (-24)
         popq    %rbx
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbx)
@@ -146,15 +127,17 @@
         cfi_def_cfa_register (%rbp)
         andq    $-32, %rsp
         subq    $32, %rsp
-       vextractf128 $1, %ymm0, (%rsp)
+       vmovaps %ymm0, (%rsp)
         vzeroupper
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps %xmm0, 16(%rsp)
-       vmovaps (%rsp), %xmm0
-       call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps %xmm0, %xmm1
+       vmovaps %xmm0, (%rsp)
         vmovaps 16(%rsp), %xmm0
-       vinsertf128 $1, %xmm1, %ymm0, %ymm0
+       call    HIDDEN_JUMPTARGET(\callee)
+       /* combine xmm0 (return of second call) with result of first
+          call (saved on stack). Might be worth exploring logic that
+          uses `vpblend` and reads in ymm1 using -16(rsp).  */
+       vmovaps (%rsp), %xmm1
+       vinsertf128 $1, %xmm0, %ymm1, %ymm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -172,17 +155,19 @@
         cfi_def_cfa_register (%rbp)
         andq    $-32, %rsp
         subq    $64, %rsp
-       vextractf128 $1, %ymm0, 16(%rsp)
-       vextractf128 $1, %ymm1, (%rsp)
+       vmovaps %ymm0, (%rsp)
+       vmovaps %ymm1, 32(%rsp)
         vzeroupper
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps %xmm0, 32(%rsp)
+       vmovaps 48(%rsp), %xmm1
+       vmovaps %xmm0, (%rsp)
         vmovaps 16(%rsp), %xmm0
-       vmovaps (%rsp), %xmm1
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps %xmm0, %xmm1
-       vmovaps 32(%rsp), %xmm0
-       vinsertf128 $1, %xmm1, %ymm0, %ymm0
+       /* combine xmm0 (return of second call) with result of first
+          call (saved on stack). Might be worth exploring logic that
+          uses `vpblend` and reads in ymm1 using -16(rsp).  */
+       vmovaps (%rsp), %xmm1
+       vinsertf128 $1, %xmm0, %ymm1, %ymm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -197,38 +182,21 @@
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq    %rsp, %rbp
-       cfi_def_cfa_register (%rbp)
         andq    $-32, %rsp
-       pushq   %r13
-       cfi_adjust_cfa_offset (8)
-       cfi_rel_offset (%r13, 0)
+       subq    $32, %rsp
+       vmovaps %ymm0, (%rsp)
+       pushq   %rbx
         pushq   %r14
-       cfi_adjust_cfa_offset (8)
-       cfi_rel_offset (%r14, 0)
-       subq    $48, %rsp
+       movq    %rdi, %rbx
         movq    %rsi, %r14
-       vmovaps %ymm0, (%rsp)
-       movq    %rdi, %r13
-       vmovaps 16(%rsp), %xmm1
-       vmovaps %xmm1, 32(%rsp)
         vzeroupper
-       vmovaps (%rsp), %xmm0
         call    HIDDEN_JUMPTARGET(\callee)
         vmovaps 32(%rsp), %xmm0
-       lea     (%rsp), %rdi
-       lea     16(%rsp), %rsi
+       leaq    16(%rbx), %rdi
+       leaq    16(%r14), %rsi
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps (%rsp), %xmm0
-       vmovaps 16(%rsp), %xmm1
-       vmovaps %xmm0, 16(%r13)
-       vmovaps %xmm1, 16(%r14)
-       addq    $48, %rsp
         popq    %r14
-       cfi_adjust_cfa_offset (-8)
-       cfi_restore (%r14)
-       popq    %r13
-       cfi_adjust_cfa_offset (-8)
-       cfi_restore (%r13)
+       popq    %rbx
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -245,15 +213,16 @@
         movq    %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq    $-64, %rsp
-       subq    $128, %rsp
+       subq    $64, %rsp
         vmovups %zmm0, (%rsp)
-       vmovupd (%rsp), %ymm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd %ymm0, 64(%rsp)
+       vmovupd %ymm0, (%rsp)
         vmovupd 32(%rsp), %ymm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovupd %ymm0, 96(%rsp)
-       vmovups 64(%rsp), %zmm0
+       /* combine ymm0 (return of second call) with result of first
+          call (saved on stack).  */
+       vmovaps (%rsp), %ymm1
+       vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -270,18 +239,19 @@
         movq    %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq    $-64, %rsp
-       subq    $192, %rsp
+       addq    $-128, %rsp
         vmovups %zmm0, (%rsp)
         vmovups %zmm1, 64(%rsp)
-       vmovups (%rsp), %ymm0
-       vmovups 64(%rsp), %ymm1
+       /* ymm0 and ymm1 are already set.  */
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovups %ymm0, 128(%rsp)
-       vmovups 32(%rsp), %ymm0
         vmovups 96(%rsp), %ymm1
+       vmovaps %ymm0, (%rsp)
+       vmovups 32(%rsp), %ymm0
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovups %ymm0, 160(%rsp)
-       vmovups 128(%rsp), %zmm0
+       /* combine ymm0 (return of second call) with result of first
+          call (saved on stack).  */
+       vmovaps (%rsp), %ymm1
+       vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
@@ -298,25 +268,20 @@
         movq    %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq    $-64, %rsp
-       pushq   %r12
-       pushq   %r13
-       subq    $176, %rsp
-       movq    %rsi, %r13
+       subq    $64, %rsp
         vmovaps %zmm0, (%rsp)
-       movq    %rdi, %r12
-       vmovaps (%rsp), %ymm0
+       pushq   %rbx
+       pushq   %r14
+       movq    %rdi, %rbx
+       movq    %rsi, %r14
+       /* ymm0 is already set.  */
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps 32(%rsp), %ymm0
-       lea     64(%rsp), %rdi
-       lea     96(%rsp), %rsi
+       vmovaps 48(%rsp), %ymm0
+       leaq    32(%rbx), %rdi
+       leaq    32(%r14), %rsi
         call    HIDDEN_JUMPTARGET(\callee)
-       vmovaps 64(%rsp), %ymm0
-       vmovaps 96(%rsp), %ymm1
-       vmovaps %ymm0, 32(%r12)
-       vmovaps %ymm1, 32(%r13)
-       addq    $176, %rsp
-       popq    %r13
-       popq    %r12
+       popq    %r14
+       popq    %rbx
         movq    %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq    %rbp
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Sat, 19 Nov 2022 00:13:31 +0000 (16:13 -0800)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Mon, 28 Nov 2022 04:22:49 +0000 (20:22 -0800)
sysdeps/x86_64/fpu/svml_d_wrapper_impl.h		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/svml_s_wrapper_impl.h		patch \| blob \| blame \| history