1 /* Function acosh vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
 * ALGORITHM DESCRIPTION:
 *
 *      Compute acosh(x) as log(x + sqrt(x*x - 1))
 *      using RSQRT instructions for starting the
 *      square root approximation, and small table lookups for log
 *      that map to AVX-512 permute instructions
 *
 *      Special cases:
 *
 *      acosh(NaN) = quiet NaN, and raise invalid exception
 *      acosh(x) = NaN if x < 1
 *
 */
37 /* Offsets for data table __svml_dacosh_data_internal_avx512
42 #define SmallThreshold 320
44 #define LargeThreshold 448
52 #define RcpBitMask 960
53 #define OneEighth 1024
55 #define poly_coeff9 1152
56 #define poly_coeff8 1216
57 #define poly_coeff7 1280
58 #define poly_coeff6 1344
59 #define poly_coeff5 1408
60 #define poly_coeff4 1472
61 #define poly_coeff3 1536
62 #define poly_coeff2 1600
63 #define poly_coeff1 1664
69 .section .text.evex512, "ax", @progbits
70 ENTRY(_ZGVeN8v_acosh_skx)
72 cfi_def_cfa_offset(16)
78 vmovups One+__svml_dacosh_data_internal_avx512(%rip), %zmm5
80 /* polynomial computation for small inputs */
81 vmovups ca2+__svml_dacosh_data_internal_avx512(%rip), %zmm13
82 vmovups ca1+__svml_dacosh_data_internal_avx512(%rip), %zmm14
85 * sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s
86 * poly_s = c1+c2*Eh+c3*Eh^2
88 vmovups c4s+__svml_dacosh_data_internal_avx512(%rip), %zmm1
89 vmovups c2s+__svml_dacosh_data_internal_avx512(%rip), %zmm2
90 vmovups c1s+__svml_dacosh_data_internal_avx512(%rip), %zmm6
92 /* very large inputs ? */
93 vmovups Threshold+__svml_dacosh_data_internal_avx512(%rip), %zmm15
95 /* out of range inputs? */
96 vmovups LargeThreshold+__svml_dacosh_data_internal_avx512(%rip), %zmm3
98 /* not a very small input ? */
99 vmovups SmallThreshold+__svml_dacosh_data_internal_avx512(%rip), %zmm10
100 vmovaps %zmm0, %zmm12
103 vmovaps %zmm5, %zmm11
104 vfmsub231pd {rn-sae}, %zmm12, %zmm12, %zmm11
105 vcmppd $21, {sae}, %zmm15, %zmm12, %k2
106 vcmppd $22, {sae}, %zmm3, %zmm12, %k0
107 vcmppd $18, {sae}, %zmm5, %zmm12, %k1
108 vrsqrt14pd %zmm11, %zmm4
109 vcmppd $21, {sae}, %zmm10, %zmm11, %k3
110 vfmadd231pd {rn-sae}, %zmm11, %zmm13, %zmm14
111 vmovups c3s+__svml_dacosh_data_internal_avx512(%rip), %zmm13
113 /* Sh ~sqrt(-1+x^2) */
114 vmulpd {rn-sae}, %zmm4, %zmm11, %zmm9
115 vmulpd {rn-sae}, %zmm11, %zmm14, %zmm8
118 vaddpd {rn-sae}, %zmm12, %zmm9, %zmm15
121 vsubpd {rn-sae}, %zmm12, %zmm15, %zmm14
124 vmovaps %zmm11, %zmm0
127 /* rel. error term: Eh=1-Sh*R0 */
129 vfmsub213pd {rn-sae}, %zmm9, %zmm4, %zmm0
130 vfnmadd231pd {rn-sae}, %zmm9, %zmm4, %zmm7
132 /* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
133 vfnmadd231pd {rn-sae}, %zmm0, %zmm4, %zmm7
136 vsubpd {rn-sae}, %zmm14, %zmm9, %zmm4
137 vmovups poly_coeff7+__svml_dacosh_data_internal_avx512(%rip), %zmm14
138 vfmadd231pd {rn-sae}, %zmm7, %zmm1, %zmm13
139 vfmadd213pd {rn-sae}, %zmm2, %zmm7, %zmm13
140 vfmadd213pd {rn-sae}, %zmm6, %zmm7, %zmm13
143 vmulpd {rn-sae}, %zmm7, %zmm9, %zmm7
145 /* Sl + Sh*Eh*poly_s */
146 vfmadd213pd {rn-sae}, %zmm0, %zmm13, %zmm7
149 vmovups poly_coeff9+__svml_dacosh_data_internal_avx512(%rip), %zmm13
151 /* polynomial computation for small inputs */
152 vaddpd {rn-sae}, %zmm7, %zmm9, %zmm0
154 /* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
155 vaddpd {rn-sae}, %zmm7, %zmm15, %zmm6
156 vfmadd231pd {rn-sae}, %zmm0, %zmm8, %zmm0
158 /* fixup for very large inputs */
159 vmovups OneEighth+__svml_dacosh_data_internal_avx512(%rip), %zmm8
162 vsubpd {rn-sae}, %zmm15, %zmm6, %zmm9
163 vmovups poly_coeff6+__svml_dacosh_data_internal_avx512(%rip), %zmm15
164 vmulpd {rn-sae}, %zmm8, %zmm12, %zmm6{%k2}
167 vsubpd {rn-sae}, %zmm9, %zmm7, %zmm3
168 vrcp14pd %zmm6, %zmm1
171 vaddpd {rn-sae}, %zmm4, %zmm3, %zmm7
174 vmovups __svml_dacosh_data_internal_avx512(%rip), %zmm3
176 /* round reciprocal to 1+4b mantissas */
177 vpaddq AddB5+__svml_dacosh_data_internal_avx512(%rip), %zmm1, %zmm2
179 /* fixup for very large inputs */
180 vxorpd %zmm7, %zmm7, %zmm7{%k2}
181 vmovups poly_coeff8+__svml_dacosh_data_internal_avx512(%rip), %zmm1
182 vandpd RcpBitMask+__svml_dacosh_data_internal_avx512(%rip), %zmm2, %zmm8
183 vmovups Log_tbl_L+__svml_dacosh_data_internal_avx512(%rip), %zmm2
185 /* Prepare table index */
186 vpsrlq $48, %zmm8, %zmm9
188 /* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
189 vfmsub231pd {rn-sae}, %zmm8, %zmm6, %zmm5
192 vgetexppd {sae}, %zmm8, %zmm4
193 vmovups Four+__svml_dacosh_data_internal_avx512(%rip), %zmm6
194 vpermt2pd Log_tbl_H+64+__svml_dacosh_data_internal_avx512(%rip), %zmm9, %zmm3
195 vpermt2pd Log_tbl_L+64+__svml_dacosh_data_internal_avx512(%rip), %zmm9, %zmm2
196 vsubpd {rn-sae}, %zmm6, %zmm4, %zmm4{%k2}
197 vfmadd231pd {rn-sae}, %zmm8, %zmm7, %zmm5
198 vmovups poly_coeff5+__svml_dacosh_data_internal_avx512(%rip), %zmm6
199 vmovups poly_coeff4+__svml_dacosh_data_internal_avx512(%rip), %zmm7
202 vmovups L2H+__svml_dacosh_data_internal_avx512(%rip), %zmm8
205 vmovups L2L+__svml_dacosh_data_internal_avx512(%rip), %zmm9
206 vfmadd231pd {rn-sae}, %zmm5, %zmm13, %zmm1
207 vmovups poly_coeff2+__svml_dacosh_data_internal_avx512(%rip), %zmm13
208 vfnmadd231pd {rn-sae}, %zmm4, %zmm8, %zmm3
209 vfnmadd213pd {rn-sae}, %zmm2, %zmm9, %zmm4
210 vfmadd213pd {rn-sae}, %zmm14, %zmm5, %zmm1
211 vmovups poly_coeff3+__svml_dacosh_data_internal_avx512(%rip), %zmm2
212 vmovups poly_coeff1+__svml_dacosh_data_internal_avx512(%rip), %zmm14
213 vfmadd213pd {rn-sae}, %zmm15, %zmm5, %zmm1
216 vmulpd {rn-sae}, %zmm5, %zmm5, %zmm15
217 vfmadd213pd {rn-sae}, %zmm6, %zmm5, %zmm1
218 vfmadd213pd {rn-sae}, %zmm7, %zmm5, %zmm1
219 vfmadd213pd {rn-sae}, %zmm2, %zmm5, %zmm1
220 vfmadd213pd {rn-sae}, %zmm13, %zmm5, %zmm1
221 vfmadd213pd {rn-sae}, %zmm14, %zmm5, %zmm1
224 vfmadd213pd {rn-sae}, %zmm4, %zmm15, %zmm1
226 /* R+Tl + R^2*Poly */
227 vaddpd {rn-sae}, %zmm5, %zmm1, %zmm5
228 vaddpd {rn-sae}, %zmm5, %zmm3, %zmm0{%k3}
230 /* Go to special inputs processing branch */
231 jne L(SPECIAL_VALUES_BRANCH)
232 # LOE rbx r12 r13 r14 r15 k0 zmm0 zmm12
235 * and exit the function
251 L(SPECIAL_VALUES_BRANCH):
252 vmovups %zmm12, 64(%rsp)
253 vmovups %zmm0, 128(%rsp)
254 # LOE rbx r12 r13 r14 r15 k0 zmm0
257 # LOE rbx r12 r13 r14 r15 eax k0
261 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
262 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
265 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
266 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
269 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
270 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
271 # LOE rbx r15 r12d r13d
280 /* Call scalar math function */
281 jc L(SCALAR_MATH_CALL)
282 # LOE rbx r15 r12d r13d
288 L(SPECIAL_VALUES_LOOP):
292 /* Check bits in range mask */
293 jl L(RANGEMASK_CHECK)
294 # LOE rbx r15 r12d r13d
302 vmovups 128(%rsp), %zmm0
306 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
307 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
308 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
309 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
310 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
311 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
312 # LOE rbx r12 r13 r14 r15 zmm0
314 /* Scalar math function call
315 * to process special input
320 vmovsd 64(%rsp, %r14, 8), %xmm0
322 # LOE rbx r14 r15 r12d r13d xmm0
324 vmovsd %xmm0, 128(%rsp, %r14, 8)
326 /* Process special inputs in loop */
327 jmp L(SPECIAL_VALUES_LOOP)
328 # LOE rbx r15 r12d r13d
329 END(_ZGVeN8v_acosh_skx)
331 .section .rodata, "a"
334 #ifdef __svml_dacosh_data_internal_avx512_typedef
335 typedef unsigned int VUINT32;
337 __declspec(align(64)) VUINT32 Log_tbl_H[16][2];
338 __declspec(align(64)) VUINT32 Log_tbl_L[16][2];
339 __declspec(align(64)) VUINT32 One[8][2];
340 __declspec(align(64)) VUINT32 SmallThreshold[8][2];
341 __declspec(align(64)) VUINT32 Threshold[8][2];
342 __declspec(align(64)) VUINT32 LargeThreshold[8][2];
343 __declspec(align(64)) VUINT32 ca2[8][2];
344 __declspec(align(64)) VUINT32 ca1[8][2];
345 __declspec(align(64)) VUINT32 c4s[8][2];
346 __declspec(align(64)) VUINT32 c3s[8][2];
347 __declspec(align(64)) VUINT32 c2s[8][2];
348 __declspec(align(64)) VUINT32 c1s[8][2];
349 __declspec(align(64)) VUINT32 AddB5[8][2];
350 __declspec(align(64)) VUINT32 RcpBitMask[8][2];
351 __declspec(align(64)) VUINT32 OneEighth[8][2];
352 __declspec(align(64)) VUINT32 Four[8][2];
353 __declspec(align(64)) VUINT32 poly_coeff9[8][2];
354 __declspec(align(64)) VUINT32 poly_coeff8[8][2];
355 __declspec(align(64)) VUINT32 poly_coeff7[8][2];
356 __declspec(align(64)) VUINT32 poly_coeff6[8][2];
357 __declspec(align(64)) VUINT32 poly_coeff5[8][2];
358 __declspec(align(64)) VUINT32 poly_coeff4[8][2];
359 __declspec(align(64)) VUINT32 poly_coeff3[8][2];
360 __declspec(align(64)) VUINT32 poly_coeff2[8][2];
361 __declspec(align(64)) VUINT32 poly_coeff1[8][2];
362 __declspec(align(64)) VUINT32 L2H[8][2];
363 __declspec(align(64)) VUINT32 L2L[8][2];
364 } __svml_dacosh_data_internal_avx512;
366 __svml_dacosh_data_internal_avx512:
368 .quad 0x0000000000000000
369 .quad 0xbfaf0a30c0120000
370 .quad 0xbfbe27076e2b0000
371 .quad 0xbfc5ff3070a78000
372 .quad 0xbfcc8ff7c79a8000
373 .quad 0xbfd1675cababc000
374 .quad 0xbfd4618bc21c4000
375 .quad 0xbfd739d7f6bbc000
376 .quad 0xbfd9f323ecbf8000
377 .quad 0xbfdc8ff7c79a8000
378 .quad 0xbfdf128f5faf0000
379 .quad 0xbfe0be72e4252000
380 .quad 0xbfe1e85f5e704000
381 .quad 0xbfe307d7334f2000
382 .quad 0xbfe41d8fe8468000
383 .quad 0xbfe52a2d265bc000
386 .quad 0x0000000000000000
387 .quad 0x3d53ab33d066d1d2
388 .quad 0x3d2a342c2af0003c
389 .quad 0xbd43d3c873e20a07
390 .quad 0xbd4a21ac25d81ef3
391 .quad 0x3d59f1fc63382a8f
392 .quad 0xbd5ec27d0b7b37b3
393 .quad 0xbd50069ce24c53fb
394 .quad 0xbd584bf2b68d766f
395 .quad 0xbd5a21ac25d81ef3
396 .quad 0xbd3bb2cd720ec44c
397 .quad 0xbd55056d312f7668
398 .quad 0xbd1a07bd8b34be7c
399 .quad 0x3d5e83c094debc15
400 .quad 0x3d5aa33736867a17
401 .quad 0xbd46abb9df22bc57
404 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
407 .quad 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000
410 .quad 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000
413 .quad 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff
416 .quad 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7
419 .quad 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e
422 .quad 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612
425 .quad 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000
428 .quad 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000
431 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
434 .quad 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
437 .quad 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
440 .quad 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000
443 .quad 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000
446 .quad 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368
449 .quad 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778
452 .quad 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9
455 .quad 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1
458 .quad 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736
461 .quad 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af
464 .quad 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65
467 .quad 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1
470 .quad 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000
471 /* L2H = log(2)_high */
473 .quad 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
474 /* L2L = log(2)_low */
476 .quad 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
478 .type __svml_dacosh_data_internal_avx512, @object
479 .size __svml_dacosh_data_internal_avx512, .-__svml_dacosh_data_internal_avx512