/* Function asinhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1))
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *
 */
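
/*
 * For reference, a minimal scalar C sketch of the same identity (purely
 * illustrative; it is not the vector code path implemented below):
 *
 *   #include <math.h>
 *
 *   static float
 *   asinhf_ref (float x)
 *   {
 *     float ax = fabsf (x);
 *     // asinh is odd, so compute for |x| and restore the sign at the end.
 *     // NaN propagates through sqrtf/logf; +/-Inf maps to +/-Inf.
 *     float r = logf (ax + sqrtf (ax * ax + 1.0f));
 *     return copysignf (r, x);
 *   }
 */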
/* Offsets for data table __svml_sasinh_data_internal.  */
#define iOffExpoMask 352
#define sBigThreshold 384
#define sLargestFinite 512
#define sLittleThreshold 544
#define sThirtyOne 608

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_asinhf_avx2)
	cfi_def_cfa_offset(16)

	/* Load the constant 1 and a sign mask */
	vmovups	sOne+__svml_sasinh_data_internal(%rip), %ymm8

	/* No need to split X when FMA is available in hardware. */
	vmulps	%ymm9, %ymm9, %ymm5
	vmovups	sTopMask8+__svml_sasinh_data_internal(%rip), %ymm1
	/*
	 * Finally, express Y + W = X^2 + 1 accurately where Y has <= 8 bits.
	 * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1
	 * as the dominant component in the compensated summation.  Otherwise,
	 * if |X| >= 1, then since X2Hi only has 22 significant bits, the basic
	 * addition will be exact anyway until we get to |X| >= 2^24.  But by
	 * that time the log function is well-conditioned enough that the
	 * rounding error doesn't matter.  Hence we can treat 1 as dominant even
	 * if it literally isn't.
	 */
	vaddps	%ymm5, %ymm8, %ymm13
	vandps	%ymm1, %ymm13, %ymm2
	vsubps	%ymm13, %ymm8, %ymm11
	vsubps	%ymm2, %ymm13, %ymm15
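
	/*
	 * A scalar sketch of the split just described (hedged: the variable
	 * names are illustrative, not the exact register assignment; fmaf is
	 * from <math.h>):
	 *
	 *   // x2_hi + x2_lo == x*x exactly when an FMA is available.
	 *   float x2_hi = x * x;
	 *   float x2_lo = fmaf (x, x, -x2_hi);
	 *   // Compensated sum of 1 + x*x: y is the rounded sum, w collects
	 *   // the rounding error plus the low part of the product.
	 *   float y = 1.0f + x2_hi;
	 *   float w = (1.0f - y) + x2_hi + x2_lo;
	 *   // The vector code additionally truncates y to its top 8 significant
	 *   // bits (sTopMask8 below) and folds the discarded low bits into w.
	 */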
	/*
	 * Compute R = 1/sqrt(Y + W) * (1 + d)
	 * Force R to <= 8 significant bits.
	 * This means that R * Y and R^2 * Y are exactly representable.
	 */
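
	/*
	 * Truncating a float to its top 8 significant bits amounts to masking
	 * its encoding with sTopMask8 (0xFFFF0000, defined in the table below).
	 * A scalar sketch, assuming the usual IEEE binary32 layout:
	 *
	 *   #include <stdint.h>
	 *   #include <string.h>
	 *
	 *   static float
	 *   top8 (float v)
	 *   {
	 *     uint32_t bits;
	 *     memcpy (&bits, &v, sizeof bits);  // reinterpret the encoding
	 *     bits &= 0xFFFF0000u;              // keep sign, exponent, 7 mantissa bits
	 *     memcpy (&v, &bits, sizeof v);
	 *     return v;
	 *   }
	 */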
	vfmsub213ps %ymm5, %ymm9, %ymm4
	vaddps	%ymm11, %ymm5, %ymm12

	/*
	 * Get the absolute value of the input, since we will exploit antisymmetry
	 * and mostly assume X >= 0 in the core computation
	 */
	vandps	SgnMask+__svml_sasinh_data_internal(%rip), %ymm9, %ymm6

	/*
	 * Check whether the input is finite, by checking |X| <= MaxFloat
	 * Otherwise set the rangemask so that the callout will get used.
	 * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat)
	 */
	vcmpnle_uqps sLargestFinite+__svml_sasinh_data_internal(%rip), %ymm6, %ymm10
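
	/*
	 * In scalar terms the range test above is simply (a hedged sketch,
	 * using fabsf from <math.h> and FLT_MAX from <float.h>):
	 *
	 *   // True for +/-Inf and for NaN, since NaN compares unordered
	 *   // against everything; those lanes take the scalar callout.
	 *   int need_callout = !(fabsf (x) <= FLT_MAX);
	 */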
	vaddps	%ymm12, %ymm4, %ymm14

	/*
	 * Unfortunately, we can still be in trouble if |X| <= 2^-5, since
	 * the absolute error 2^-(7+24)-ish in sqrt(1 + X^2) gets scaled up
	 * by 1/X and comes close to our threshold.  Hence if |X| <= 2^-4,
	 * perform an alternative computation
	 * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16
	 */
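
	/*
	 * Scalar sketch of that small-|X| branch (series for sqrt(1+t) - 1
	 * with t = X^2; variable names are illustrative only):
	 *
	 *   float t = x * x;
	 *   float x2over2 = 0.5f * t;
	 *   float x4over4 = x2over2 * x2over2;                // X^4/4
	 *   float x46 = fmaf (x4over4, x2over2, -x4over4);    // -X^4/4 + X^6/8
	 *   float small = x2over2 + 0.5f * x46;               // X^2/2 - X^4/8 + X^6/16
	 */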
	vaddps	%ymm4, %ymm5, %ymm4

	/*
	 * The following computation can go wrong for very large X, basically
	 * because X^2 overflows.  But for large X we have
	 * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
	 * we can just later stick X back into the log and tweak up the exponent.
	 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
	 * to stay in the safe range for the later log computation.
	 * Compute a flag now telling us when to do this.
	 */
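
	/*
	 * Scalar sketch of the large-|X| rescaling (constants as defined in the
	 * table below: sBigThreshold = 2^30, XScale = 2^-30, sThirtyOne = 31;
	 * names are illustrative):
	 *
	 *   // asinh(x) ~= log(2*x) = log(x * 2^-30) + 31*ln(2) for x >= 2^30,
	 *   // so feed the scaled value to the log path and add 31 to the
	 *   // exponent term n before the final n*ln2 + poly(r) reconstruction.
	 *   int big = !(fabsf (x) < 0x1p30f);
	 *   float xlog = big ? fabsf (x) * 0x1p-30f : h;   // h = 1 + |x| + sqrt-term
	 *   float n_extra = big ? 31.0f : 0.0f;            // added to n later
	 */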
	vcmplt_oqps sBigThreshold+__svml_sasinh_data_internal(%rip), %ymm6, %ymm7
	vaddps	%ymm15, %ymm14, %ymm3

	/*
	 * Now compute the correction factor
	 *     1 / (1 + d)
	 *   = 1 / (1 + (sqrt(1 - e) - 1))
	 *   = 1 / sqrt(1 - e)
	 *   = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
	 * So compute the first three nonconstant terms of that, so that
	 * we have a relative correction (1 + Corr) to apply to S etc.
	 */
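
	/*
	 * Scalar sketch of the correction term (sHalf = 0.5, sC2 = 0.375 and
	 * sC3 = 0.3125 in the table below; Horner order as in the code):
	 *
	 *   float corr = e * (0.5f + e * (0.375f + e * 0.3125f));
	 *   // ... so that sqrt(Y + W) ~= (S + T) * (1.0f + corr).
	 */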
	vmovups	sC3+__svml_sasinh_data_internal(%rip), %ymm12
	vmovmskps %ymm10, %edx
	vandps	%ymm1, %ymm0, %ymm10

	/*
	 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
	 * and T = (W/sqrt(Y + W)) * (1 + d)
	 * so that S + T = sqrt(Y + W) * (1 + d)
	 * S is exact, and the rounding error in T is OK.
	 */
	vmulps	%ymm10, %ymm2, %ymm15
	vmulps	%ymm3, %ymm10, %ymm14
	vmovups	sHalf+__svml_sasinh_data_internal(%rip), %ymm3
	vsubps	%ymm8, %ymm15, %ymm0

	/*
	 * Obtain sqrt(1 + X^2) - 1 in two pieces:
	 *     sqrt(1 + X^2) - 1
	 *   = (S + T) * (1 + Corr) - 1
	 *   = [S - 1] + [T + (S + T) * Corr]
	 * We need a compensated summation for the last part.  We treat S - 1
	 * as the larger part; it certainly is until about X < 2^-4, and in that
	 * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1.
	 * Final sum is dTmp5 (hi) + dTmp7 (lo)
	 */
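
	/*
	 * Scalar sketch of that two-piece reconstruction (illustrative names):
	 *
	 *   float S = R * Y;                      // exact: R and Y have <= 8 sig. bits
	 *   float T = R * W;
	 *   float Shi = S - 1.0f;                 // dominant part of sqrt(1+X^2) - 1
	 *   float Slo = fmaf (S + T, corr, T);    // small correction part
	 *   float hi = Shi + Slo;
	 *   float lo = (Shi - hi) + Slo;          // compensated low word
	 */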
	vaddps	%ymm14, %ymm15, %ymm13

	/*
	 * Compute e = -(2 * d + d^2)
	 * The first FMA is exact, and the rounding error in the other is acceptable
	 * since d and e are ~ 2^-8
	 */
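
	/*
	 * Scalar sketch: with R = (1 + d)/sqrt(Y + W) we have
	 * R*R*(Y + W) = (1 + d)^2, hence (names illustrative):
	 *
	 *   float e = fmaf (-R, S, 1.0f);   // 1 - R*(R*Y), exact
	 *   e = fmaf (-R, T, e);            // e = 1 - R*R*(Y+W) = -(2d + d^2)
	 */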
	vmovaps	%ymm8, %ymm11
	vfnmadd231ps %ymm15, %ymm10, %ymm11
	vfnmadd231ps %ymm14, %ymm10, %ymm11
	vfmadd213ps sC2+__svml_sasinh_data_internal(%rip), %ymm11, %ymm12
	vfmadd213ps %ymm3, %ymm11, %ymm12
	vmulps	%ymm12, %ymm11, %ymm1

	/* Now multiplex the two possible computations */
	vcmple_oqps sLittleThreshold+__svml_sasinh_data_internal(%rip), %ymm6, %ymm11
	vfmadd213ps %ymm14, %ymm13, %ymm1
	vaddps	%ymm0, %ymm1, %ymm2
	vsubps	%ymm2, %ymm0, %ymm10

	/* sX2over2 = X^2/2 */
	vmulps	%ymm4, %ymm3, %ymm0
	vaddps	%ymm10, %ymm1, %ymm1

	/* sX4over4 = X^4/4 */
	vmulps	%ymm0, %ymm0, %ymm5

	/* sX46 = -X^4/4 + X^6/8 */
	vfmsub231ps %ymm0, %ymm5, %ymm5

	/* sX46over2 = -X^4/8 + X^6/16 */
	vmulps	%ymm5, %ymm3, %ymm3
	vaddps	%ymm3, %ymm0, %ymm5
	vblendvps %ymm11, %ymm5, %ymm2, %ymm2
	vsubps	%ymm5, %ymm0, %ymm4

	/*
	 * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1].
	 * It's always safe to assume |X| is larger.
	 * This is the final 2-part argument to the log1p function
	 */
	vaddps	%ymm2, %ymm6, %ymm14
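
	/*
	 * Scalar sketch of that final compensated sum (ax = |x|, v = high part
	 * of sqrt(1 + x^2) - 1, vlo = its low part; names illustrative):
	 *
	 *   float H = ax + v;                   // high part of the log1p argument
	 *   float L = ((ax - H) + v) + vlo;     // low part, |x| assumed dominant
	 */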
	/*
	 * Now resume the main code.
	 * reduction: compute r, n
	 */
	vmovups	iBrkValue+__svml_sasinh_data_internal(%rip), %ymm5
	vaddps	%ymm4, %ymm3, %ymm10

	/*
	 * Now we feed into the log1p code, using H in place of _VARG1 and
	 * also adding L into Xl.
	 * compute 1+x as high, low parts
	 */
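
	/*
	 * Scalar sketch of the log reduction performed below: write the reduced
	 * argument as 2^n * m with m in [2/3, 4/3), working directly on the
	 * binary32 encoding (iBrkValue = bits of 2/3, iOffExpoMask = significand
	 * mask; names illustrative):
	 *
	 *   #include <stdint.h>
	 *   #include <string.h>
	 *
	 *   uint32_t u;
	 *   memcpy (&u, &xr, sizeof u);                     // xr = H (or 2^-30 * x)
	 *   int32_t k = (int32_t) (u - 0x3f2aaaab);         // offset from 2/3
	 *   int32_t n = k >> 23;                            // scale exponent
	 *   uint32_t mu = (uint32_t) (k & 0x007fffff) + 0x3f2aaaab;
	 *   float m;
	 *   memcpy (&m, &mu, sizeof m);                     // m in [2/3, 4/3)
	 *   float r = m - 1.0f;                             // |r| <= 1/3
	 *   // then log1p ~= n*ln2 + poly(r), with the low part L scaled by
	 *   // 2^-n and folded into r first.
	 */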
	vmaxps	%ymm14, %ymm8, %ymm15
	vminps	%ymm14, %ymm8, %ymm0
	vblendvps %ymm11, %ymm10, %ymm1, %ymm12
	vsubps	%ymm14, %ymm6, %ymm1
	vaddps	%ymm0, %ymm15, %ymm3

	/* Now multiplex to the case X = 2^-30 * input, Xl = sL = 0 in the "big" case. */
	vmulps	XScale+__svml_sasinh_data_internal(%rip), %ymm6, %ymm6
	vaddps	%ymm1, %ymm2, %ymm13
	vsubps	%ymm3, %ymm15, %ymm15
	vaddps	%ymm13, %ymm12, %ymm1
	vaddps	%ymm15, %ymm0, %ymm2
	vblendvps %ymm7, %ymm3, %ymm6, %ymm0
	vaddps	%ymm2, %ymm1, %ymm4
	vpsubd	%ymm5, %ymm0, %ymm1
	vpsrad	$23, %ymm1, %ymm6
	vpand	iOffExpoMask+__svml_sasinh_data_internal(%rip), %ymm1, %ymm2
	vmovups	sPoly+224+__svml_sasinh_data_internal(%rip), %ymm1
	vpslld	$23, %ymm6, %ymm10
	vpaddd	%ymm5, %ymm2, %ymm13
	vcvtdq2ps %ymm6, %ymm0
	vpsubd	%ymm10, %ymm8, %ymm12

	/* polynomial evaluation */
	vsubps	%ymm8, %ymm13, %ymm8

	/* Add 31 to the exponent in the "large" case to get log(2 * input) */
	vaddps	sThirtyOne+__svml_sasinh_data_internal(%rip), %ymm0, %ymm3
	vandps	%ymm7, %ymm4, %ymm11
	vmulps	%ymm12, %ymm11, %ymm14
	vblendvps %ymm7, %ymm0, %ymm3, %ymm0
	vaddps	%ymm8, %ymm14, %ymm2
	vfmadd213ps sPoly+192+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vfmadd213ps sPoly+160+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vfmadd213ps sPoly+128+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vfmadd213ps sPoly+96+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vfmadd213ps sPoly+64+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vfmadd213ps sPoly+32+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vfmadd213ps sPoly+__svml_sasinh_data_internal(%rip), %ymm2, %ymm1
	vmulps	%ymm1, %ymm2, %ymm4
	vfmadd213ps %ymm2, %ymm2, %ymm4

	/* final reconstruction */
	vfmadd132ps sLn2+__svml_sasinh_data_internal(%rip), %ymm4, %ymm0
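
	/*
	 * Scalar sketch of the polynomial step and the reconstruction just done
	 * (P0..P7 and ln2 as in the sPoly/sLn2 table entries below):
	 *
	 *   float p = P7;
	 *   p = fmaf (p, r, P6);  p = fmaf (p, r, P5);  p = fmaf (p, r, P4);
	 *   p = fmaf (p, r, P3);  p = fmaf (p, r, P2);  p = fmaf (p, r, P1);
	 *   p = fmaf (p, r, P0);
	 *   float log1p_r = fmaf (r * p, r, r);                    // r + r^2 * poly(r)
	 *   float res = fmaf ((float) n, 0.693147182f, log1p_r);   // + n*ln(2)
	 */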
	/* Finally, reincorporate the original sign. */
	vandps	sSign+__svml_sasinh_data_internal(%rip), %ymm9, %ymm7
	vxorps	%ymm0, %ymm7, %ymm0

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm9
	/* Restore registers
	 * and exit the function
	 */
L(SPECIAL_VALUES_BRANCH):
	vmovups	%ymm9, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0
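
	/*
	 * The special-values path spills the input vector and the fast-path
	 * result to the stack, then repairs, one lane at a time, every lane
	 * flagged in the range mask by calling the scalar routine.  Roughly
	 * (a hedged C sketch, not the literal register protocol used below):
	 *
	 *   float in[8], out[8];        // images of 32(%rsp) and 64(%rsp)
	 *   for (int i = 0; i < 8; i++)
	 *     if (mask & (1 << i))
	 *       out[i] = asinhf (in[i]);
	 */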
	# LOE rbx r12 r13 r14 r15 eax edx

	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22

	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22

	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

L(SPECIAL_VALUES_LOOP):

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	vmovups	64(%rsp), %ymm0

	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22

	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22

	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0
	/* Scalar math function call
	 * to process special input
	 */
	vmovss	32(%rsp, %r14, 4), %xmm0
	call	asinhf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 64(%rsp, %r14, 4)
	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_asinhf_avx2)

	.section .rodata, "a"
#ifdef __svml_sasinh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 SgnMask[8][1];
	__declspec(align(32)) VUINT32 sOne[8][1];
	__declspec(align(32)) VUINT32 sPoly[8][8][1];
	__declspec(align(32)) VUINT32 iBrkValue[8][1];
	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
	__declspec(align(32)) VUINT32 sBigThreshold[8][1];
	__declspec(align(32)) VUINT32 sC2[8][1];
	__declspec(align(32)) VUINT32 sC3[8][1];
	__declspec(align(32)) VUINT32 sHalf[8][1];
	__declspec(align(32)) VUINT32 sLargestFinite[8][1];
	__declspec(align(32)) VUINT32 sLittleThreshold[8][1];
	__declspec(align(32)) VUINT32 sSign[8][1];
	__declspec(align(32)) VUINT32 sThirtyOne[8][1];
	__declspec(align(32)) VUINT32 sTopMask8[8][1];
	__declspec(align(32)) VUINT32 XScale[8][1];
	__declspec(align(32)) VUINT32 sLn2[8][1];
} __svml_sasinh_data_internal;
#endif

__svml_sasinh_data_internal:
	/* SgnMask */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sOne = SP 1.0 */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sPoly[] = SP polynomial */
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */

	/* iBrkValue = SP 2/3 */
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab

	/* iOffExpoMask = SP significand mask */
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sBigThreshold = SP 2^30 */
	.long	0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000

	/* sC2 = SP 3/8 */
	.long	0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000

	/* sC3 = SP 5/16 */
	.long	0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000

	/* sHalf = SP 0.5 */
	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000

	/* sLargestFinite = SP FLT_MAX */
	.long	0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF

	/* sLittleThreshold = SP 2^-4 */
	.long	0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000

	/* sSign = SP sign bit */
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000

	/* sThirtyOne = SP 31.0 */
	.long	0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000

	/* sTopMask8 = keep top 8 significant bits */
	.long	0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000

	/* XScale = SP 2^-30 */
	.long	0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000
	/* sLn2 = SP ln(2) */
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.type	__svml_sasinh_data_internal, @object
	.size	__svml_sasinh_data_internal, .-__svml_sasinh_data_internal