x86-64: Add vector acos/acosf implementation to libmvec

author Sunil K Pandey <skpgkp2@gmail.com>

Wed, 22 Dec 2021 14:20:41 +0000 (06:20 -0800)

committer Sunil K Pandey <skpgkp2@gmail.com>

Wed, 22 Dec 2021 21:03:14 +0000 (13:03 -0800)
author Sunil K Pandey <skpgkp2@gmail.com>
Wed, 22 Dec 2021 14:20:41 +0000 (06:20 -0800)
committer Sunil K Pandey <skpgkp2@gmail.com>
Wed, 22 Dec 2021 21:03:14 +0000 (13:03 -0800)
diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h

index b80ff332a0829ac940541ba2b6a98360f1dd8255..2ccdd1fc532629747384ed92da126d54b159e155 100644 (file)
--- a/bits/libm-simd-decl-stubs.h
+++ b/bits/libm-simd-decl-stubs.h
@@ -98,4 +98,15 @@
  #define __DECL_SIMD_powf32x
  #define __DECL_SIMD_powf64x
  #define __DECL_SIMD_powf128x
+
+#define __DECL_SIMD_acos
+#define __DECL_SIMD_acosf
+#define __DECL_SIMD_acosl
+#define __DECL_SIMD_acosf16
+#define __DECL_SIMD_acosf32
+#define __DECL_SIMD_acosf64
+#define __DECL_SIMD_acosf128
+#define __DECL_SIMD_acosf32x
+#define __DECL_SIMD_acosf64x
+#define __DECL_SIMD_acosf128x
  #endif
diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h

index da4cf4e10cc71bb5ff308702337ea28bef5d0f85..2cc6654208367b3397965fbca1e7b49e8952afe5 100644 (file)
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@@ -50,7 +50,7 @@
  /* Trigonometric functions.  */
  
  /* Arc cosine of X.  */
-__MATHCALL (acos,, (_Mdouble_ __x));
+__MATHCALL_VEC (acos,, (_Mdouble_ __x));
  /* Arc sine of X.  */
  __MATHCALL (asin,, (_Mdouble_ __x));
  /* Arc tangent of X.  */
diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist

index 363d4ace1e6248eef3cbbcecfa55d4f7863979b6..b37b55777e279b65820b1b8330f5b2948f60f69b 100644 (file)
--- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F
  GLIBC_2.22 _ZGVeN8v_sin F
  GLIBC_2.22 _ZGVeN8vv_pow F
  GLIBC_2.22 _ZGVeN8vvv_sincos F
+GLIBC_2.35 _ZGVbN2v_acos F
+GLIBC_2.35 _ZGVbN4v_acosf F
+GLIBC_2.35 _ZGVcN4v_acos F
+GLIBC_2.35 _ZGVcN8v_acosf F
+GLIBC_2.35 _ZGVdN4v_acos F
+GLIBC_2.35 _ZGVdN8v_acosf F
+GLIBC_2.35 _ZGVeN16v_acosf F
+GLIBC_2.35 _ZGVeN8v_acos F
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h

index dc0bfb3705526e1db155b64be9d16020daf10aa9..dabb74cbb93e2655f3b95550a1bffb68632ffcdb 100644 (file)
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@@ -58,6 +58,10 @@
  #  define __DECL_SIMD_pow __DECL_SIMD_x86_64
  #  undef __DECL_SIMD_powf
  #  define __DECL_SIMD_powf __DECL_SIMD_x86_64
+#  undef __DECL_SIMD_acos
+#  define __DECL_SIMD_acos __DECL_SIMD_x86_64
+#  undef __DECL_SIMD_acosf
+#  define __DECL_SIMD_acosf __DECL_SIMD_x86_64
  
  # endif
  #endif
diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h

index 311bb4e391c4fd0a096444078e8c33a7217183f4..4bcbd1fbce446e384931d07c4d8ef52160d60a46 100644 (file)
--- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
@@ -28,6 +28,8 @@
  !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
  !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
  !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
  
  !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
  !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -41,3 +43,5 @@
  !GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
  !GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
  !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig

index b0e3bf7887bb62e5de79a6a985415a97ef4641d2..7acf1f306cc1740680e8566371d46d44d00420b9 100644 (file)
--- a/sysdeps/x86_64/fpu/Makeconfig
+++ b/sysdeps/x86_64/fpu/Makeconfig
@@ -22,6 +22,7 @@ postclean-generated += libmvec.mk
  
  # Define for both math and mathvec directories.
  libmvec-funcs = \
+  acos \
    cos \
    exp \
    log \
diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions

index 08132045d6b9a355ef29ca4eb198ce88f011051e..2985fe7ca7b9da272e270f10a62f0765e64c7213 100644 (file)
--- a/sysdeps/x86_64/fpu/Versions
+++ b/sysdeps/x86_64/fpu/Versions
@@ -13,4 +13,8 @@ libmvec {
      _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
      _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
    }
+  GLIBC_2.35 {
+    _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
+    _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
+  }
  }
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps

index 1c75f0ead46819a74ca8fd78e1bdfb322c7c0e95..6c12976c824492789001c4592641f6fde3785459 100644 (file)
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -25,6 +25,26 @@ float: 1
  float128: 1
  ldouble: 2
  
+Function: "acos_vlen16":
+float: 1
+
+Function: "acos_vlen2":
+double: 1
+
+Function: "acos_vlen4":
+double: 1
+float: 2
+
+Function: "acos_vlen4_avx2":
+double: 1
+
+Function: "acos_vlen8":
+double: 1
+float: 2
+
+Function: "acos_vlen8_avx2":
+float: 1
+
  Function: "acosh":
  double: 2
  float: 2
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h

new file mode 100644 (file)

index 0000000..3aed563
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
@@ -0,0 +1,39 @@
+/* Common definition for libmathvec ifunc selections optimized with
+   AVX512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#undef PASTER2
+#define PASTER2(x,y)   x##_##y
+
+extern void REDIRECT_NAME (void);
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+  /* Pick skx only if AVX512 is not disfavored and AVX512DQ is usable.  */
+  if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
+    return OPTIMIZE (skx);
+  /* Otherwise select the avx2_wrapper variant.  */
+  return OPTIMIZE (avx2_wrapper);
+}
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S

new file mode 100644 (file)

index 0000000..25fb8d0
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
@@ -0,0 +1,20 @@
+/* SSE2 version of vectorized acos, vector length is 2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
+#include "../svml_d_acos2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c

new file mode 100644 (file)

index 0000000..5ba5d6f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized acos, vector length is 2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVbN2v_acos
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S

new file mode 100644 (file)

index 0000000..aea4527
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
@@ -0,0 +1,303 @@
+/* Function acos vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define SgnBit                         0
+#define OneHalf                        16
+#define SmallNorm                      32
+#define MOne                           48
+#define Two                            64
+#define sqrt_coeff                     80
+#define poly_coeff                     144
+#define PiH                            336
+#define Pi2H                           352
+
+#include <sysdep.h>
+
+        .text
+       .section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN2v_acos_sse4)
+        subq      $72, %rsp
+        cfi_def_cfa_offset(80)
+        movaps    %xmm0, %xmm5
+        movups    __svml_dacos_data_internal(%rip), %xmm3
+        movups    OneHalf+__svml_dacos_data_internal(%rip), %xmm6
+
+/* x = -|arg|: OR the SgnBit mask (xmm3) into the input (xmm5) */
+        movaps    %xmm3, %xmm4
+        orps      %xmm5, %xmm4
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
+        movaps    %xmm6, %xmm7
+        mulpd     %xmm4, %xmm7
+        addpd     %xmm7, %xmm6
+
+/* S ~ 2*sqrt(Y) */
+        cvtpd2ps  %xmm6, %xmm9
+        movlhps   %xmm9, %xmm9
+
+/* x^2 */
+        movaps    %xmm4, %xmm0
+        rsqrtps   %xmm9, %xmm10
+        mulpd     %xmm4, %xmm0
+        cvtps2pd  %xmm10, %xmm11
+        minpd     %xmm6, %xmm0
+        movaps    %xmm6, %xmm1
+        movaps    %xmm0, %xmm2
+        cmpltpd   SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
+        cmpnltpd  %xmm6, %xmm2
+        addpd     %xmm6, %xmm6
+        andnps    %xmm11, %xmm1
+        movaps    %xmm0, %xmm11
+        movaps    %xmm1, %xmm12
+        andps     %xmm5, %xmm3
+        mulpd     %xmm1, %xmm12
+        mulpd     %xmm6, %xmm1
+        mulpd     %xmm12, %xmm6
+        mulpd     %xmm0, %xmm11
+        subpd     Two+__svml_dacos_data_internal(%rip), %xmm6
+        movups    sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm13
+        movaps    %xmm6, %xmm14
+        mulpd     %xmm6, %xmm13
+        mulpd     %xmm1, %xmm14
+        addpd     sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm13
+        mulpd     %xmm6, %xmm13
+        addpd     sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm13
+        mulpd     %xmm13, %xmm6
+
+/* polynomial */
+        movups    poly_coeff+__svml_dacos_data_internal(%rip), %xmm15
+        movaps    %xmm11, %xmm7
+        mulpd     %xmm0, %xmm15
+        addpd     sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
+        addpd     poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
+        mulpd     %xmm11, %xmm7
+        mulpd     %xmm6, %xmm14
+        mulpd     %xmm11, %xmm15
+        subpd     %xmm14, %xmm1
+        movups    MOne+__svml_dacos_data_internal(%rip), %xmm8
+        andps     %xmm2, %xmm1
+
+/* Mask lanes with |arg| > 1 or NaN; edx bits send them to the special branch */
+        cmpnlepd  %xmm4, %xmm8
+        movmskpd  %xmm8, %edx
+
+/* X<X^2 iff X<0 */
+        movaps    %xmm5, %xmm12
+        movups    poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm8
+        movaps    %xmm2, %xmm13
+        movups    poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm6
+        mulpd     %xmm0, %xmm8
+        mulpd     %xmm0, %xmm6
+        addpd     poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm8
+        addpd     poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm6
+        cmpltpd   %xmm0, %xmm12
+        addpd     %xmm15, %xmm8
+        mulpd     %xmm11, %xmm6
+        mulpd     %xmm7, %xmm8
+        movups    poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm9
+        mulpd     %xmm0, %xmm9
+        addpd     poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm9
+        addpd     %xmm6, %xmm9
+        movups    poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm10
+        movaps    %xmm2, %xmm6
+        mulpd     %xmm0, %xmm10
+        addpd     %xmm8, %xmm9
+        addpd     poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm10
+        mulpd     %xmm11, %xmm9
+        movups    poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm14
+        andnps    %xmm4, %xmm6
+        addpd     %xmm9, %xmm10
+        mulpd     %xmm0, %xmm14
+        mulpd     %xmm10, %xmm11
+        addpd     poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm14
+        addpd     %xmm11, %xmm14
+        mulpd     %xmm0, %xmm14
+        orps      %xmm1, %xmm6
+        pxor      %xmm3, %xmm6
+        mulpd     %xmm6, %xmm14
+        movups    PiH+__svml_dacos_data_internal(%rip), %xmm0
+        andps     %xmm2, %xmm0
+        andnps    Pi2H+__svml_dacos_data_internal(%rip), %xmm13
+        andps     %xmm12, %xmm0
+        addpd     %xmm13, %xmm0
+        addpd     %xmm14, %xmm6
+        addpd     %xmm6, %xmm0
+        testl     %edx, %edx
+
+/* Any bit set in edx: go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+                                # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
+
+/* Release the 72-byte frame
+ * and return to the caller
+ */
+
+L(EXIT):
+        addq      $72, %rsp
+        cfi_def_cfa_offset(8)
+        ret
+        cfi_def_cfa_offset(80)
+
+/* Slow path: spill the input (xmm5) and the
+ * fast-path result (xmm0), save r12-r14
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        movups    %xmm5, 32(%rsp)
+        movups    %xmm0, 48(%rsp)
+                                # LOE rbx rbp r12 r13 r14 r15 edx
+
+        xorl      %eax, %eax
+        movq      %r12, 16(%rsp)
+        cfi_offset(12, -64)
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        cfi_offset(13, -72)
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        cfi_offset(14, -80)
+                                # LOE rbx rbp r15 r12d r13d
+
+/* Test bit r12d (lane index)
+ * of the lane mask in r13d
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Bit set: recompute this lane with the scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+                                # LOE rbx rbp r15 r12d r13d
+
+/* Advance to the next lane; loop
+ * until both lanes were examined
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $2, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+                                # LOE rbx rbp r15 r12d r13d
+
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        movups    48(%rsp), %xmm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        cfi_offset(12, -64)
+        cfi_offset(13, -72)
+        cfi_offset(14, -80)
+                                # LOE rbx rbp r12 r13 r14 r15 xmm0
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     32(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+                                # LOE rbx rbp r14 r15 r12d r13d xmm0
+
+        movsd     %xmm0, 48(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+                                # LOE rbx rbp r15 r12d r13d
+END(_ZGVbN2v_acos_sse4)
+
+        .section .rodata, "a"
+        .align 16
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(16)) VUINT32 SgnBit[2][2];
+        __declspec(align(16)) VUINT32 OneHalf[2][2];
+        __declspec(align(16)) VUINT32 SmallNorm[2][2];
+        __declspec(align(16)) VUINT32 MOne[2][2];
+        __declspec(align(16)) VUINT32 Two[2][2];
+        __declspec(align(16)) VUINT32 sqrt_coeff[4][2][2];
+        __declspec(align(16)) VUINT32 poly_coeff[12][2][2];
+        __declspec(align(16)) VUINT32 PiH[2][2];
+        __declspec(align(16)) VUINT32 Pi2H[2][2];
+} __svml_dacos_data_internal;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 16
+        .quad 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 16
+        .quad 0x3000000000000000, 0x3000000000000000
+        /*== MOne ==*/
+        .align 16
+        .quad 0xbff0000000000000, 0xbff0000000000000
+        /*== Two ==*/
+        .align 16
+        .quad 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4] ==*/
+        .align 16
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12] ==*/
+        .align 16
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiH ==*/
+        .align 16
+        .quad 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2H ==*/
+        .align 16
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        .align 16
+        .type  __svml_dacos_data_internal,@object
+        .size  __svml_dacos_data_internal,.-__svml_dacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S

new file mode 100644 (file)

index 0000000..750f71c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
@@ -0,0 +1,20 @@
+/* SSE version of vectorized acos, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
+#include "../svml_d_acos4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c

new file mode 100644 (file)

index 0000000..6453e7e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized acos, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVdN4v_acos
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S

new file mode 100644 (file)

index 0000000..bf85bdb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
@@ -0,0 +1,285 @@
+/* Function acos vectorized with AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define SgnBit                         0
+#define OneHalf                        32
+#define SmallNorm                      64
+#define MOne                           96
+#define Two                            128
+#define sqrt_coeff                     160
+#define poly_coeff                     288
+#define PiH                            672
+#define Pi2H                           704
+
+#include <sysdep.h>
+
+        .text
+       .section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN4v_acos_avx2)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-32, %rsp
+        subq      $96, %rsp
+        vmovupd   __svml_dacos_data_internal(%rip), %ymm6
+        vmovupd   OneHalf+__svml_dacos_data_internal(%rip), %ymm7
+        vmovapd   %ymm0, %ymm5
+
+/* x = -|arg| */
+        vorpd     %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*(-x) */
+        vfmadd231pd %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+        vmulpd    %ymm4, %ymm4, %ymm8
+
+/* S ~ 2*sqrt(Y) */
+        vmovupd   sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
+        vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12
+        vminpd    %ymm7, %ymm8, %ymm2
+
+/* NaN processed in special branch (so wind test passed) */
+        vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9
+        vcvtpd2ps %ymm7, %xmm10
+        vmovupd   poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8
+        vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
+        vrsqrtps  %xmm10, %xmm11
+        vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8
+        vcvtps2pd %xmm11, %ymm13
+        vmovupd   poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11
+        vandnpd   %ymm13, %ymm12, %ymm14
+        vmulpd    %ymm14, %ymm14, %ymm15
+        vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11
+        vmulpd    %ymm2, %ymm2, %ymm13
+        vmovupd   poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12
+        vmulpd    %ymm13, %ymm13, %ymm10
+        vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
+        vandpd    %ymm5, %ymm6, %ymm3
+        vaddpd    %ymm7, %ymm7, %ymm6
+        vmulpd    %ymm6, %ymm14, %ymm7
+        vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6
+        vmovupd   poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14
+        vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
+        vmulpd    %ymm6, %ymm7, %ymm15
+        vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14
+        vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
+        vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
+
+/* polynomial */
+        vmovupd   poly_coeff+__svml_dacos_data_internal(%rip), %ymm6
+        vfnmadd213pd %ymm7, %ymm15, %ymm0
+        vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
+        vblendvpd %ymm1, %ymm0, %ymm4, %ymm0
+        vfmadd213pd %ymm8, %ymm13, %ymm6
+        vmovmskpd %ymm9, %edx
+        vmovupd   poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9
+        vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
+        vfmadd213pd %ymm9, %ymm13, %ymm11
+        vfmadd213pd %ymm11, %ymm10, %ymm6
+        vfmadd213pd %ymm12, %ymm13, %ymm6
+        vfmadd213pd %ymm14, %ymm13, %ymm6
+        vmulpd    %ymm6, %ymm2, %ymm9
+
+/* X<X^2 iff X<0 */
+        vcmplt_oqpd %ymm2, %ymm5, %ymm6
+        vandpd    PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
+        vandnpd   Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
+        vxorpd    %ymm3, %ymm0, %ymm1
+        vfmadd213pd %ymm1, %ymm1, %ymm9
+        vandpd    %ymm6, %ymm2, %ymm2
+        vaddpd    %ymm7, %ymm2, %ymm8
+        vaddpd    %ymm9, %ymm8, %ymm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+                                # LOE rbx r12 r13 r14 r15 edx ymm0 ymm5
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovupd   %ymm5, 32(%rsp)
+        vmovupd   %ymm0, 64(%rsp)
+                                # LOE rbx r12 r13 r14 r15 edx ymm0
+
+        xorl      %eax, %eax
+                                # LOE rbx r12 r13 r14 r15 eax edx
+
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r15 r12d r13d
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+                                # LOE rbx r15 r12d r13d
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $4, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+                                # LOE rbx r15 r12d r13d
+
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovupd   64(%rsp), %ymm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r12 r13 r14 r15 ymm0
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     32(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+                                # LOE rbx r14 r15 r12d r13d xmm0
+
+        movsd     %xmm0, 64(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+                                # LOE rbx r15 r12d r13d
+END(_ZGVdN4v_acos_avx2)
+
+        .section .rodata, "a"
+        .align 32
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(32)) VUINT32 SgnBit[4][2];
+        __declspec(align(32)) VUINT32 OneHalf[4][2];
+        __declspec(align(32)) VUINT32 SmallNorm[4][2];
+        __declspec(align(32)) VUINT32 MOne[4][2];
+        __declspec(align(32)) VUINT32 Two[4][2];
+        __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2];
+        __declspec(align(32)) VUINT32 poly_coeff[12][4][2];
+        __declspec(align(32)) VUINT32 PiH[4][2];
+        __declspec(align(32)) VUINT32 Pi2H[4][2];
+} __svml_dacos_data_internal;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit: sign-bit mask of a double, one copy per lane ==*/
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf: 0.5 ==*/
+        .align 32
+        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm: 2^-255 ==*/
+        .align 32
+        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+        /*== MOne: -1.0 ==*/
+        .align 32
+        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+        /*== Two: 2.0 ==*/
+        .align 32
+        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4]: rows stored from sqrt_coeff4 down to sqrt_coeff1 ==*/
+        .align 32
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12]: rows stored from poly_coeff12 down to poly_coeff1 ==*/
+        .align 32
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiH: pi rounded to double ==*/
+        .align 32
+        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2H: pi/2 rounded to double ==*/
+        .align 32
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        .align 32
+        .type  __svml_dacos_data_internal,@object
+        .size  __svml_dacos_data_internal,.-__svml_dacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S

new file mode 100644 (file)

index 0000000..4d64fd1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
@@ -0,0 +1,20 @@
+/* AVX2 version of vectorized acos, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper /* Renames every use in the file included below.  */
+#include "../svml_d_acos8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c

new file mode 100644 (file)

index 0000000..1e7d186
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized acos, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVeN8v_acos /* Symbol handed to libc_ifunc_redirected below.  */
+#include "ifunc-mathvec-avx512-skx.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR -- confirm.  */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S

new file mode 100644 (file)

index 0000000..521ff73
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
@@ -0,0 +1,307 @@
+/* Function acos vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Byte offsets into __svml_dacos_data_internal, one 64-byte row per entry.
+ * Suffix _N is storage order: sqrt_coeff_1 is the row labelled sqrt_coeff4.  */
+#define SgnBit                         0
+#define OneHalf                        64
+#define SmallNorm                      128
+#define MOne                           192
+#define Two                            256
+#define sqrt_coeff_1                   320
+#define sqrt_coeff_2                   384
+#define sqrt_coeff_3                   448
+#define sqrt_coeff_4                   512
+#define poly_coeff_1                   576
+#define poly_coeff_2                   640
+#define poly_coeff_3                   704
+#define poly_coeff_4                   768
+#define poly_coeff_5                   832
+#define poly_coeff_6                   896
+#define poly_coeff_7                   960
+#define poly_coeff_8                   1024
+#define poly_coeff_9                   1088
+#define poly_coeff_10                  1152
+#define poly_coeff_11                  1216
+#define poly_coeff_12                  1280
+#define PiH                            1344
+#define Pi2H                           1408
+
+#include <sysdep.h>
+
+        .text
+       .section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN8v_acos_skx) /* acos of the 8 doubles in zmm0; result returned in zmm0.  */
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $192, %rsp
+        vmovups   __svml_dacos_data_internal(%rip), %zmm7
+        vmovups   OneHalf+__svml_dacos_data_internal(%rip), %zmm8
+
+/* S ~ 2*sqrt(Y) */
+        vmovups   SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
+        vmovups   Two+__svml_dacos_data_internal(%rip), %zmm14
+        vmovups   sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
+        vmovups   sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
+        vmovups   sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
+        vmovups   MOne+__svml_dacos_data_internal(%rip), %zmm10
+        vmovaps   %zmm0, %zmm6
+
+/* x = -|arg| (zmm5); zmm4 = sign bit of arg */
+        vorpd     %zmm6, %zmm7, %zmm5
+        vandpd    %zmm6, %zmm7, %zmm4
+
+/* Y = 0.5 + 0.5*x, i.e. 0.5 - 0.5*|arg| */
+        vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
+
+/* x^2 */
+        vmulpd    {rn-sae}, %zmm5, %zmm5, %zmm9
+        vrsqrt14pd %zmm8, %zmm12
+        vcmppd    $17, {sae}, %zmm11, %zmm8, %k1
+        vcmppd    $17, {sae}, %zmm10, %zmm5, %k0
+        vmovups   poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
+        vmovups   poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
+        vminpd    {sae}, %zmm8, %zmm9, %zmm3
+        vmovups   poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
+        vxorpd    %zmm12, %zmm12, %zmm12{%k1}
+        vaddpd    {rn-sae}, %zmm8, %zmm8, %zmm0
+        vcmppd    $21, {sae}, %zmm8, %zmm3, %k4
+
+/* X<X^2 iff X<0 */
+        vcmppd    $17, {sae}, %zmm3, %zmm6, %k2
+        vmulpd    {rn-sae}, %zmm12, %zmm12, %zmm13
+        vmulpd    {rn-sae}, %zmm12, %zmm0, %zmm7
+        vmovups   poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
+
+/* polynomial */
+        vmovups   poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
+        vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
+        vmovups   sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
+        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
+        vmovups   poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
+        vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
+        vmovups   poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
+        vmulpd    {rn-sae}, %zmm0, %zmm7, %zmm14
+        vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
+        vmovups   poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
+        kmovw     %k0, %edx
+        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
+        vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
+        vmovups   poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
+        vmulpd    {rn-sae}, %zmm3, %zmm3, %zmm0
+        vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
+        vmovups   poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
+        vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
+        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
+        vblendmpd %zmm2, %zmm5, %zmm2{%k4}
+        vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
+        vmovups   poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
+        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
+        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
+        kandw     %k4, %k2, %k3
+        vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
+        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
+        vmulpd    {rn-sae}, %zmm0, %zmm0, %zmm10
+        vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
+        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
+        vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
+        vmovups   Pi2H+__svml_dacos_data_internal(%rip), %zmm0
+        vmulpd    {rn-sae}, %zmm3, %zmm1, %zmm1
+        vxorpd    %zmm4, %zmm2, %zmm3
+        vxorpd    %zmm0, %zmm0, %zmm0{%k4}
+        vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
+        vorpd     PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3}
+        vaddpd    {rn-sae}, %zmm1, %zmm0, %zmm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %zmm6, 64(%rsp)
+        vmovups   %zmm0, 128(%rsp)
+                                # LOE rbx r12 r13 r14 r15 edx zmm0
+
+        xorl      %eax, %eax
+                                # LOE rbx r12 r13 r14 r15 eax edx
+
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r15 r12d r13d
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+                                # LOE rbx r15 r12d r13d
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $8, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+                                # LOE rbx r15 r12d r13d
+
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovups   128(%rsp), %zmm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r12 r13 r14 r15 zmm0
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     64(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+                                # LOE rbx r14 r15 r12d r13d xmm0
+
+        movsd     %xmm0, 128(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+                                # LOE rbx r15 r12d r13d
+END(_ZGVeN8v_acos_skx)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(64)) VUINT32 SgnBit[8][2];
+        __declspec(align(64)) VUINT32 OneHalf[8][2];
+        __declspec(align(64)) VUINT32 SmallNorm[8][2];
+        __declspec(align(64)) VUINT32 MOne[8][2];
+        __declspec(align(64)) VUINT32 Two[8][2];
+        __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
+        __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
+        __declspec(align(64)) VUINT32 PiH[8][2];
+        __declspec(align(64)) VUINT32 Pi2H[8][2];
+} __svml_dacos_data_internal;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit: sign-bit mask of a double, one copy per lane ==*/
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf: 0.5 ==*/
+        .align 64
+        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm: 2^-255 ==*/
+        .align 64
+        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+        /*== MOne: -1.0 ==*/
+        .align 64
+        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+        /*== Two: 2.0 ==*/
+        .align 64
+        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4]: first row below is at offset sqrt_coeff_1, last at sqrt_coeff_4 ==*/
+        .align 64
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12]: first row below is at offset poly_coeff_1, last at poly_coeff_12 ==*/
+        .align 64
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiH: pi rounded to double ==*/
+        .align 64
+        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2H: pi/2 rounded to double ==*/
+        .align 64
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        .align 64
+        .type  __svml_dacos_data_internal,@object
+        .size  __svml_dacos_data_internal,.-__svml_dacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S

new file mode 100644 (file)

index 0000000..1ff0cfc
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
@@ -0,0 +1,20 @@
+/* AVX2 version of vectorized acosf.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper /* Renames every use in the file included below.  */
+#include "../svml_s_acosf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c

new file mode 100644 (file)

index 0000000..fcf0578
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized acosf, vector length is 16.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVeN16v_acosf /* Symbol handed to libc_ifunc_redirected below.  */
+#include "ifunc-mathvec-avx512-skx.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR -- confirm.  */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
+              __redirect__ZGVeN16v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S

new file mode 100644 (file)

index 0000000..36f08c4
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -0,0 +1,271 @@
+/* Function acosf vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Byte offsets into __svml_sacos_data_internal; consecutive entries are
+ * 64 bytes apart.  Pi2H is placed before PiH in this list.  */
+#define SgnBit                         0
+#define OneHalf                        64
+#define SmallNorm                      128
+#define MOne                           192
+#define Two                            256
+#define sqrt_coeff_1                   320
+#define sqrt_coeff_2                   384
+#define poly_coeff_1                   448
+#define poly_coeff_2                   512
+#define poly_coeff_3                   576
+#define poly_coeff_4                   640
+#define poly_coeff_5                   704
+#define Pi2H                           768
+#define PiH                            832
+#include <sysdep.h>
+
+        .text
+       .section .text.evex512,"ax",@progbits /* Was misspelled "exex512"; matches the acos8 AVX-512 kernel's section.  */
+ENTRY(_ZGVeN16v_acosf_skx)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $192, %rsp
+        vmovups   __svml_sacos_data_internal(%rip), %zmm5
+        vmovups   OneHalf+__svml_sacos_data_internal(%rip), %zmm6
+
+/* SQ ~ 2*sqrt(Y) */
+        vmovups   SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
+        vmovups   MOne+__svml_sacos_data_internal(%rip), %zmm8
+        vmovups   Two+__svml_sacos_data_internal(%rip), %zmm12
+        vmovups   sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
+        vmovaps   %zmm0, %zmm4
+
+/* x = -|arg| */
+        vorps     %zmm4, %zmm5, %zmm3
+        vandps    %zmm4, %zmm5, %zmm2
+        vmovups   sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
+
+/* Y = 0.5 + 0.5*(-x) */
+        vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
+
+/* x^2 */
+        vmulps    {rn-sae}, %zmm3, %zmm3, %zmm7
+        vrsqrt14ps %zmm6, %zmm10
+        vcmpps    $17, {sae}, %zmm9, %zmm6, %k1
+        vcmpps    $22, {sae}, %zmm3, %zmm8, %k0
+        vmovups   poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
+        vminps    {sae}, %zmm6, %zmm7, %zmm1
+        vmovups   poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
+        vxorps    %zmm10, %zmm10, %zmm10{%k1}
+        vaddps    {rn-sae}, %zmm6, %zmm6, %zmm14
+        vmulps    {rn-sae}, %zmm1, %zmm1, %zmm8
+        vmulps    {rn-sae}, %zmm10, %zmm10, %zmm11
+        vmulps    {rn-sae}, %zmm10, %zmm14, %zmm5
+        vcmpps    $21, {sae}, %zmm6, %zmm1, %k4
+
+/* X<X^2 iff X<0 */
+        vcmpps    $17, {sae}, %zmm1, %zmm4, %k2
+
+/* polynomial */
+        vmovups   poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
+        vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
+        vmovups   poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
+        vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
+        vmovups   poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
+        vmovups   Pi2H+__svml_sacos_data_internal(%rip), %zmm12
+        vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
+        vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
+        vmulps    {rn-sae}, %zmm14, %zmm5, %zmm15
+        vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
+        vxorps    %zmm12, %zmm12, %zmm12{%k4}
+        vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
+        vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
+        kmovw     %k0, %edx
+        vmulps    {rn-sae}, %zmm1, %zmm11, %zmm13
+        vblendmps %zmm0, %zmm3, %zmm0{%k4}
+        vxorps    %zmm2, %zmm0, %zmm1
+        kandw     %k4, %k2, %k3
+        vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
+        vorps     PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3}
+        vaddps    {rn-sae}, %zmm13, %zmm12, %zmm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm4
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %zmm4, 64(%rsp)
+        vmovups   %zmm0, 128(%rsp)
+                                # LOE rbx r12 r13 r14 r15 edx zmm0
+
+        xorl      %eax, %eax
+                                # LOE rbx r12 r13 r14 r15 eax edx
+
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r15 r12d r13d
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+                                # LOE rbx r15 r12d r13d
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $16, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+                                # LOE rbx r15 r12d r13d
+
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovups   128(%rsp), %zmm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r12 r13 r14 r15 zmm0
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     64(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+                                # LOE rbx r14 r15 r12d r13d xmm0
+
+        movss     %xmm0, 128(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+                                # LOE rbx r15 r12d r13d
+END(_ZGVeN16v_acosf_skx)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(64)) VUINT32 SgnBit[16][1];
+        __declspec(align(64)) VUINT32 OneHalf[16][1];
+        __declspec(align(64)) VUINT32 SmallNorm[16][1];
+        __declspec(align(64)) VUINT32 MOne[16][1];
+        __declspec(align(64)) VUINT32 Two[16][1];
+        __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
+        __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
+        __declspec(align(64)) VUINT32 Pi2H[16][1];
+        __declspec(align(64)) VUINT32 PiH[16][1];
+} __svml_sacos_data_internal;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit: sign-bit mask; every constant in this table fills all 16 lanes ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf: 0.5 ==*/
+        .align 64
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== SmallNorm: 2^-32 ==*/
+        .align 64
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne: -1.0 ==*/
+        .align 64
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== Two: 2.0 ==*/
+        .align 64
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2]: approx. -3/32 and 1/4 ==*/
+        .align 64
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5]: stored poly_coeff5 first, poly_coeff1 (approx. 1/6) last ==*/
+        .align 64
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H: Pi/2 rounded to single precision ==*/
+        .align 64
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== PiH: Pi rounded to single precision ==*/
+        .align 64
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        .align 64
+        .type  __svml_sacos_data_internal,@object
+        .size  __svml_sacos_data_internal,.-__svml_sacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S

new file mode 100644 (file)

index 0000000..f94b3eb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
@@ -0,0 +1,20 @@
+/* SSE2 version (_ZGVbN4v_acosf_sse2) of vectorized acosf, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2
+#include "../svml_s_acosf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c

new file mode 100644 (file)

index 0000000..6f9a5c1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized acosf, vector length is 4, chosen by ifunc.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVbN4v_acosf
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
+              __redirect__ZGVbN4v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S

new file mode 100644 (file)

index 0000000..3b7c25a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
@@ -0,0 +1,270 @@
+/* Function acosf vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *      Lanes with |x| > 1 or NaN are flagged in a bit mask and
+ *      redone one at a time by the scalar acosf.
+ */
+
+/* Byte offsets into data table __svml_sacos_data_internal (16-byte entries)
+ */
+#define SgnBit                         0
+#define OneHalf                        16
+#define SmallNorm                      32
+#define MOne                           48
+#define Two                            64
+#define sqrt_coeff                     80
+#define poly_coeff                     112
+#define Pi2H                           192
+#define PiH                            208
+
+#include <sysdep.h>
+
+        .text
+       .section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN4v_acosf_sse4)
+        subq      $72, %rsp
+        cfi_def_cfa_offset(80)
+
+/* Copy of the input for the sign test: X<X^2 iff X<0 */
+        movaps    %xmm0, %xmm14
+
+/*
+ * Load SgnBit and OneHalf.  The square root formed further down is
+ * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits), SQ ~ 2*sqrt(X)
+ */
+        movups    __svml_sacos_data_internal(%rip), %xmm3
+        movups    OneHalf+__svml_sacos_data_internal(%rip), %xmm5
+
+/* x = -|arg| (sign bit forced on) */
+        movaps    %xmm3, %xmm4
+        orps      %xmm0, %xmm4
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| (finished by the addps below) */
+        movaps    %xmm5, %xmm6
+        mulps     %xmm4, %xmm6
+
+/* x^2 */
+        movaps    %xmm4, %xmm13
+        mulps     %xmm4, %xmm13
+        addps     %xmm6, %xmm5
+
+/* SQ ~ 2*sqrt(Y), from rsqrtps; the estimate is zeroed where Y < SmallNorm */
+        rsqrtps   %xmm5, %xmm8
+        minps     %xmm5, %xmm13
+        movaps    %xmm5, %xmm2
+        movaps    %xmm13, %xmm1
+        cmpltps   SmallNorm+__svml_sacos_data_internal(%rip), %xmm2
+        cmpnltps  %xmm5, %xmm1
+        cmpltps   %xmm13, %xmm14
+        addps     %xmm5, %xmm5
+        andnps    %xmm8, %xmm2
+        movaps    %xmm13, %xmm11
+        movaps    %xmm2, %xmm9
+        movaps    %xmm1, %xmm6
+        mulps     %xmm2, %xmm9
+        andnps    %xmm4, %xmm6
+        mulps     %xmm5, %xmm2
+        mulps     %xmm13, %xmm11
+        mulps     %xmm9, %xmm5
+        movups    sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10
+        andps     %xmm0, %xmm3
+
+/* polynomial in xmm13 = min(x^2, Y), coefficients from poly_coeff */
+        movups    poly_coeff+__svml_sacos_data_internal(%rip), %xmm12
+        movaps    %xmm1, %xmm15
+        mulps     %xmm13, %xmm12
+        subps     Two+__svml_sacos_data_internal(%rip), %xmm5
+        mulps     %xmm5, %xmm10
+        addps     poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12
+        mulps     %xmm2, %xmm5
+        mulps     %xmm11, %xmm12
+        addps     sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10
+        mulps     %xmm5, %xmm10
+        movups    poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5
+        subps     %xmm10, %xmm2
+        mulps     %xmm13, %xmm5
+        movups    MOne+__svml_sacos_data_internal(%rip), %xmm7
+        andps     %xmm1, %xmm2
+        cmpnleps  %xmm4, %xmm7
+        addps     poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5
+        movmskps  %xmm7, %edx
+        orps      %xmm2, %xmm6
+        addps     %xmm12, %xmm5
+        mulps     %xmm13, %xmm5
+        pxor      %xmm3, %xmm6
+        movups    PiH+__svml_sacos_data_internal(%rip), %xmm7
+        andps     %xmm1, %xmm7
+        addps     poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5
+        mulps     %xmm13, %xmm5
+        andps     %xmm14, %xmm7
+        mulps     %xmm6, %xmm5
+        andnps    Pi2H+__svml_sacos_data_internal(%rip), %xmm15
+        addps     %xmm5, %xmm6
+        addps     %xmm15, %xmm7
+        addps     %xmm6, %xmm7
+        testl     %edx, %edx
+
+/* Any lane flagged by the MOne comparison: go to special inputs processing */
+        jne       L(SPECIAL_VALUES_BRANCH)
+                                # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm7
+
+/* Move the result to xmm0,
+ * release the frame and return
+ */
+
+L(EXIT):
+        movaps    %xmm7, %xmm0
+        addq      $72, %rsp
+        cfi_def_cfa_offset(8)
+        ret
+        cfi_def_cfa_offset(80)
+
+/* Special inputs: spill the input and the fast-path result to the stack,
+ * then save r12-r14 (r12d = lane index, r13d = lane mask, r14 = scratch)
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        movups    %xmm0, 32(%rsp)
+        movups    %xmm7, 48(%rsp)
+                                # LOE rbx rbp r12 r13 r14 r15 edx
+
+        xorl      %eax, %eax
+        movq      %r12, 16(%rsp)
+        cfi_offset(12, -64)
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        cfi_offset(13, -72)
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        cfi_offset(14, -80)
+                                # LOE rbx rbp r15 r12d r13d
+
+/* Test bit r12d of the
+ * lane mask held in r13d
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Bit set: this lane needs the scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+                                # LOE rbx rbp r15 r12d r13d
+
+/* Advance to the next lane;
+ * the loop covers all 4 lanes
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $4, %r12d
+
+/* More lanes left: check the next bit in range mask */
+        jl        L(RANGEMASK_CHECK)
+                                # LOE rbx rbp r15 r12d r13d
+
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        movups    48(%rsp), %xmm7
+
+/* r12-r14 restored, patched result reloaded into xmm7: go to exit */
+        jmp       L(EXIT)
+        cfi_offset(12, -64)
+        cfi_offset(13, -72)
+        cfi_offset(14, -80)
+                                # LOE rbx rbp r12 r13 r14 r15 xmm7
+
+/* Scalar math function call: acosf on lane r12d of the saved input,
+ * result written to the same lane of the saved output
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     32(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+                                # LOE rbx rbp r14 r15 r12d r13d xmm0
+
+        movss     %xmm0, 48(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+                                # LOE rbx rbp r15 r12d r13d
+END(_ZGVbN4v_acosf_sse4)
+
+        .section .rodata, "a"
+        .align 16
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(16)) VUINT32 SgnBit[4][1];
+        __declspec(align(16)) VUINT32 OneHalf[4][1];
+        __declspec(align(16)) VUINT32 SmallNorm[4][1];
+        __declspec(align(16)) VUINT32 MOne[4][1];
+        __declspec(align(16)) VUINT32 Two[4][1];
+        __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
+        __declspec(align(16)) VUINT32 poly_coeff[5][4][1];
+        __declspec(align(16)) VUINT32 Pi2H[4][1];
+        __declspec(align(16)) VUINT32 PiH[4][1];
+} __svml_sacos_data_internal;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit: sign-bit mask; every constant in this table fills all 4 lanes ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf: 0.5 ==*/
+        .align 16
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== SmallNorm: 2^-32 ==*/
+        .align 16
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne: -1.0 ==*/
+        .align 16
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== Two: 2.0 ==*/
+        .align 16
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2]: approx. -3/32 and 1/4 ==*/
+        .align 16
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5]: stored poly_coeff5 first, poly_coeff1 (approx. 1/6) last ==*/
+        .align 16
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H: Pi/2 rounded to single precision ==*/
+        .align 16
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== PiH: Pi rounded to single precision ==*/
+        .align 16
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        .align 16
+        .type  __svml_sacos_data_internal,@object
+        .size  __svml_sacos_data_internal,.-__svml_sacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S

new file mode 100644 (file)

index 0000000..583ef54
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
@@ -0,0 +1,20 @@
+/* SSE wrapper version of vectorized acosf, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper
+#include "../svml_s_acosf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c

new file mode 100644 (file)

index 0000000..dd360a9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized acosf, vector length is 8, chosen by ifunc.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVdN8v_acosf
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
+              __redirect__ZGVdN8v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S

new file mode 100644 (file)

index 0000000..bc783e5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -0,0 +1,264 @@
+/* Function acosf vectorized with AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *      Lanes with |x| > 1 or NaN are flagged in a bit mask and
+ *      redone one at a time by the scalar acosf.
+ */
+
+/* Byte offsets into data table __svml_sacos_data_internal (32-byte entries)
+ */
+#define SgnBit                         0
+#define OneHalf                        32
+#define SmallNorm                      64
+#define MOne                           96
+#define Two                            128
+#define sqrt_coeff                     160
+#define poly_coeff                     224
+#define Pi2H                           384
+#define PiH                            416
+
+#include <sysdep.h>
+
+        .text
+       .section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN8v_acosf_avx2)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-32, %rsp
+        subq      $96, %rsp
+
+/*
+ * Load SgnBit and OneHalf and keep the input in ymm5.  Further down,
+ * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits), SQ ~ 2*sqrt(X)
+ */
+        vmovups   __svml_sacos_data_internal(%rip), %ymm6
+        vmovups   OneHalf+__svml_sacos_data_internal(%rip), %ymm7
+        vmovaps   %ymm0, %ymm5
+
+/* x = -|arg| (sign bit forced on) */
+        vorps     %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
+        vfmadd231ps %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+        vmulps    %ymm4, %ymm4, %ymm8
+
+/* SQ ~ 2*sqrt(Y): rsqrt estimate, zeroed where Y < SmallNorm; ymm9 = special mask */
+        vmovups   sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
+        vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9
+        vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10
+        vminps    %ymm7, %ymm8, %ymm2
+        vaddps    %ymm7, %ymm7, %ymm14
+        vrsqrtps  %ymm7, %ymm11
+        vmovups   poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8
+        vcmpnlt_uqps %ymm7, %ymm2, %ymm1
+        vmulps    %ymm2, %ymm2, %ymm7
+        vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8
+        vmovmskps %ymm9, %edx
+
+/* polynomial in ymm2 = min(x^2, Y), coefficients from poly_coeff */
+        vmovups   poly_coeff+__svml_sacos_data_internal(%rip), %ymm9
+        vandnps   %ymm11, %ymm10, %ymm12
+        vmulps    %ymm12, %ymm12, %ymm13
+        vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
+
+/* Sign test on the saved input: X<X^2 iff X<0 */
+        vcmplt_oqps %ymm2, %ymm5, %ymm10
+        vfmadd213ps %ymm8, %ymm7, %ymm9
+        vandps    %ymm5, %ymm6, %ymm3
+        vmulps    %ymm14, %ymm12, %ymm6
+        vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14
+        vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
+        vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0
+        vmulps    %ymm14, %ymm6, %ymm15
+        vmulps    %ymm9, %ymm2, %ymm14
+        vfnmadd213ps %ymm6, %ymm15, %ymm0
+        vblendvps %ymm1, %ymm0, %ymm4, %ymm0
+        vandps    PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
+        vandnps   Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12
+        vxorps    %ymm3, %ymm0, %ymm1
+        vfmadd213ps %ymm1, %ymm1, %ymm14
+        vandps    %ymm10, %ymm2, %ymm11
+        vaddps    %ymm12, %ymm11, %ymm13
+        vaddps    %ymm14, %ymm13, %ymm0
+        testl     %edx, %edx
+
+/* Any lane flagged by the MOne comparison: go to special inputs processing */
+        jne       L(SPECIAL_VALUES_BRANCH)
+                                # LOE rbx r12 r13 r14 r15 edx ymm0 ymm5
+
+/* Restore rsp and rbp
+ * and exit the function (result is in ymm0)
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Special inputs: spill the input (ymm5) and the fast-path result (ymm0),
+ * then save r12-r14 (r12d = lane index, r13d = lane mask, r14 = scratch)
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %ymm5, 32(%rsp)
+        vmovups   %ymm0, 64(%rsp)
+                                # LOE rbx r12 r13 r14 r15 edx ymm0
+
+        xorl      %eax, %eax
+                                # LOE rbx r12 r13 r14 r15 eax edx
+
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r15 r12d r13d
+
+/* Test bit r12d of the
+ * lane mask held in r13d
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Bit set: this lane needs the scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+                                # LOE rbx r15 r12d r13d
+
+/* Advance to the next lane;
+ * the loop covers all 8 lanes
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $8, %r12d
+
+/* More lanes left: check the next bit in range mask */
+        jl        L(RANGEMASK_CHECK)
+                                # LOE rbx r15 r12d r13d
+
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovups   64(%rsp), %ymm0
+
+/* r12-r14 restored, patched result reloaded into ymm0: go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+                                # LOE rbx r12 r13 r14 r15 ymm0
+
+/* Scalar math function call: acosf on lane r12d of the saved input,
+ * result written to the same lane of the saved output
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     32(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+                                # LOE rbx r14 r15 r12d r13d xmm0
+
+        movss     %xmm0, 64(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+                                # LOE rbx r15 r12d r13d
+END(_ZGVdN8v_acosf_avx2)
+
+        .section .rodata, "a"
+        .align 32
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(32)) VUINT32 SgnBit[8][1];
+        __declspec(align(32)) VUINT32 OneHalf[8][1];
+        __declspec(align(32)) VUINT32 SmallNorm[8][1];
+        __declspec(align(32)) VUINT32 MOne[8][1];
+        __declspec(align(32)) VUINT32 Two[8][1];
+        __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1];
+        __declspec(align(32)) VUINT32 poly_coeff[5][8][1];
+        __declspec(align(32)) VUINT32 Pi2H[8][1];
+        __declspec(align(32)) VUINT32 PiH[8][1];
+} __svml_sacos_data_internal;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit: sign-bit mask; every constant in this table fills all 8 lanes ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf: 0.5 ==*/
+        .align 32
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== SmallNorm: 2^-32 ==*/
+        .align 32
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne: -1.0 ==*/
+        .align 32
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== Two: 2.0 ==*/
+        .align 32
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2]: approx. -3/32 and 1/4 ==*/
+        .align 32
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5]: stored poly_coeff5 first, poly_coeff1 (approx. 1/6) last ==*/
+        .align 32
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H: Pi/2 rounded to single precision ==*/
+        .align 32
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== PiH: Pi rounded to single precision ==*/
+        .align 32
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        .align 32
+        .type  __svml_sacos_data_internal,@object
+        .size  __svml_sacos_data_internal,.-__svml_sacos_data_internal
diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S

new file mode 100644 (file)

index 0000000..9656478
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
@@ -0,0 +1,29 @@
+/* Function acos vectorized with SSE2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVbN2v_acos)
+WRAPPER_IMPL_SSE2 acos
+END (_ZGVbN2v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN2v_acos)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S

new file mode 100644 (file)

index 0000000..e99cb4a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
@@ -0,0 +1,29 @@
+/* Function acos vectorized with AVX2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVdN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVdN4v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN4v_acos)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S

new file mode 100644 (file)

index 0000000..7cbcbc9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
@@ -0,0 +1,25 @@
+/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVcN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVcN4v_acos)
diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S

new file mode 100644 (file)

index 0000000..e26b30d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
@@ -0,0 +1,25 @@
+/* Function acos vectorized with AVX-512. Wrapper to AVX2 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVeN8v_acos)
+WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
+END (_ZGVeN8v_acos)
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S

new file mode 100644 (file)

index 0000000..70e046d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
@@ -0,0 +1,25 @@
+/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVeN16v_acosf)
+WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
+END (_ZGVeN16v_acosf)
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S

new file mode 100644 (file)

index 0000000..36354b3
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
@@ -0,0 +1,29 @@
+/* Function acosf vectorized with SSE2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVbN4v_acosf)
+WRAPPER_IMPL_SSE2 acosf
+END (_ZGVbN4v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN4v_acosf)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S

new file mode 100644 (file)

index 0000000..f08864a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
@@ -0,0 +1,29 @@
+/* Function acosf vectorized with AVX2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+       .text
+ENTRY (_ZGVdN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVdN8v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN8v_acosf)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S

new file mode 100644 (file)

index 0000000..f3ed4d8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
@@ -0,0 +1,25 @@
+/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+        .text
+ENTRY (_ZGVcN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVcN8v_acosf)
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c

new file mode 100644 (file)

index 0000000..4f74b42
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
@@ -0,0 +1 @@
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c

new file mode 100644 (file)

index 0000000..4f74b42
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
@@ -0,0 +1 @@
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c

new file mode 100644 (file)

index 0000000..4f74b42
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
@@ -0,0 +1 @@
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c

new file mode 100644 (file)

index 0000000..e38b8ce
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
@@ -0,0 +1,3 @@
+#define LIBMVEC_TYPE double
+#define LIBMVEC_FUNC acos
+#include "test-vector-abi-arg1.h"
diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c

index ed932fc98d0ec769302ef3e25332d4c4bd44450e..0abc7d2021d63fe3dcadbe1d5f4bf489be59c2d6 100644 (file)
--- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
  
  #define VEC_INT_TYPE __m128i
  
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c

index 3a6e37044f1d805fc95c890858c1a67d5718dc83..dda093b914212441508c9b772609847f6e1c4ba1 100644 (file)
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
  
  #ifndef __ILP32__
  # define VEC_INT_TYPE __m256i
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c

index 99db4e76169b51044ac6a93848359bb0177a8fd1..f3230463bbe6cb09d30d58c516ee52dcf0837c4e 100644 (file)
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
  
  #define VEC_INT_TYPE __m128i
  
diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c

index 251d429ac00b8d6eec400aa5379f1791cd525729..cf9f52faf08c4bbaa8e28521a3ab709134b6f253 100644 (file)
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
  
  #ifndef __ILP32__
  # define VEC_INT_TYPE __m512i
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c

new file mode 100644 (file)

index 0000000..1e6474d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
@@ -0,0 +1 @@
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c

new file mode 100644 (file)

index 0000000..1e6474d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
@@ -0,0 +1 @@
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c

new file mode 100644 (file)

index 0000000..1e6474d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
@@ -0,0 +1 @@
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c

new file mode 100644 (file)

index 0000000..fb47f97
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
@@ -0,0 +1,3 @@
+#define LIBMVEC_TYPE float
+#define LIBMVEC_FUNC acosf
+#include "test-vector-abi-arg1.h"
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c

index c1d14cd79e91fd48c70fe3ad40cf60ec64a83c98..abbd3ed870104cdf39ecd91e015ed977e4f5a59b 100644 (file)
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
  
  #define VEC_INT_TYPE __m512i
  
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c

index d23c37206024a645f35b7a7b58750cb182f5eb87..8a240279529fdc4e457901737927357a4d9b6450 100644 (file)
--- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
  
  #define VEC_INT_TYPE __m128i
  
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c

index 3152cffb0c17e076ec7e7d81c8a8e5b8fa775988..aff0442606fdf8eb77d085f00adc92a4be68ad38 100644 (file)
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
  
  /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
  #undef VECTOR_WRAPPER_fFF
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c

index a8492abfef1f7e392fe0e2f86af79cc7a1eba5b6..913584d111bde1f125361fb6ce669e74053f947c 100644 (file)
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
  
  #define VEC_INT_TYPE __m128i
author	Sunil K Pandey <skpgkp2@gmail.com>
	Wed, 22 Dec 2021 14:20:41 +0000 (06:20 -0800)
committer	Sunil K Pandey <skpgkp2@gmail.com>
	Wed, 22 Dec 2021 21:03:14 +0000 (13:03 -0800)
bits/libm-simd-decl-stubs.h		patch \| blob \| blame \| history
math/bits/mathcalls.h		patch \| blob \| blame \| history
sysdeps/unix/sysv/linux/x86_64/libmvec.abilist		patch \| blob \| blame \| history
sysdeps/x86/fpu/bits/math-vector.h		patch \| blob \| blame \| history
sysdeps/x86/fpu/finclude/math-vector-fortran.h		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/Makeconfig		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/Versions		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/libm-test-ulps		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_d_acos2_core.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_d_acos4_core.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_d_acos8_core.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_s_acosf16_core.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_s_acosf4_core.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_s_acosf8_core.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-double-libmvec-acos.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-float-libmvec-acosf.c	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c		patch \| blob \| blame \| history
sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c		patch \| blob \| blame \| history