1 /* Function coshf vectorized with SSE4.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
31 * cosh(x) overflows for big x, i.e. when |x| exceeds MAXLOG+log(2)
35 /* Offsets for data table __svml_scosh_data_internal
42 #define _iDomainRange 80
/* _ZGVbN4v_coshf_sse4: coshf evaluated on packed single-precision lanes
   held in an XMM register.  The main path performs the range reduction
   and polynomial reconstruction annotated step by step below; when the
   domain-range check flags any lane, control goes to
   L(SPECIAL_VALUES_BRANCH), where flagged lanes are processed one at a
   time through the scalar call path.  */
54 .section .text.sse4,"ax",@progbits
55 ENTRY(_ZGVbN4v_coshf_sse4)
57 cfi_def_cfa_offset(80)
63 movups _sSign+__svml_scosh_data_internal(%rip), %xmm1
67 * dM = x/log(2) + RShifter
69 movups _sInvLn2+__svml_scosh_data_internal(%rip), %xmm9
73 /* Check for overflow/underflow */
75 movups _sShifter+__svml_scosh_data_internal(%rip), %xmm4
76 movups _sLn2hi+__svml_scosh_data_internal(%rip), %xmm5
87 * iM now is an EXP(2^N)
90 movups _sLn2lo+__svml_scosh_data_internal(%rip), %xmm7
93 /* sR = sX - sN*Log2_hi */
96 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
98 movdqu _iDomainRange+__svml_scosh_data_internal(%rip), %xmm2
103 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
104 * sSinh_r = (a3+r^2*a5)
106 movups _sPC5+__svml_scosh_data_internal(%rip), %xmm10
110 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
111 * sOut = (a4 +a6*sR2)
113 movups _sPC6+__svml_scosh_data_internal(%rip), %xmm11
116 movdqu _iHalf+__svml_scosh_data_internal(%rip), %xmm8
119 /* sR2 = sR^2, shuffled */
127 addps _sPC3+__svml_scosh_data_internal(%rip), %xmm10
128 addps _sPC4+__svml_scosh_data_internal(%rip), %xmm11
130 /* sSinh_r = r^2*(a3+r^2*a5) */
133 /* sOut = a2+sR2*(a4+a6*sR2) */
136 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
138 addps _sPC2+__svml_scosh_data_internal(%rip), %xmm11
141 /* sOut = sR2*(a2+sR2*(a4+a6*sR2)) */
144 /* sG1 = 2^(N-1)-2^(-N-1) */
147 /* sG2 = 2^(N-1)+2^(-N-1) */
151 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
154 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
158 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
164 /* Go to special inputs processing branch */
165 jne L(SPECIAL_VALUES_BRANCH)
166 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
169 * and exit the function
175 cfi_def_cfa_offset(8)
177 cfi_def_cfa_offset(80)
/* Slow path: spill %xmm0 and %xmm2 to the stack at 32(%rsp) and
   48(%rsp) so that single lanes can be addressed as 4-byte scalars
   (32(%rsp,%r14,4) is read, 48(%rsp,%r14,4) is written).  */
183 L(SPECIAL_VALUES_BRANCH):
184 movups %xmm0, 32(%rsp)
185 movups %xmm2, 48(%rsp)
186 # LOE rbx rbp r12 r13 r14 r15 edx
197 # LOE rbx rbp r15 r12d r13d
206 /* Call scalar math function */
207 jc L(SCALAR_MATH_CALL)
208 # LOE rbx rbp r15 r12d r13d
214 L(SPECIAL_VALUES_LOOP):
218 /* Check bits in range mask */
219 jl L(RANGEMASK_CHECK)
220 # LOE rbx rbp r15 r12d r13d
/* Reload the vector slot at 48(%rsp), which the per-lane scalar path
   stores into, back into %xmm2.  */
228 movups 48(%rsp), %xmm2
235 # LOE rbx rbp r12 r13 r14 r15 xmm2
237 /* Scalar math function call
238 * to process special input
243 movss 32(%rsp,%r14,4), %xmm0
245 # LOE rbx rbp r14 r15 r12d r13d xmm0
247 movss %xmm0, 48(%rsp,%r14,4)
249 /* Process special inputs in loop */
250 jmp L(SPECIAL_VALUES_LOOP)
251 # LOE rbx rbp r15 r12d r13d
252 END(_ZGVbN4v_coshf_sse4)
/* Read-only constant table for the coshf SSE4 kernel.  Every entry is
   one 32-bit pattern replicated four times (one 16-byte row), so a row
   can be loaded straight into an XMM register.  The kernel addresses
   rows as <offset>+__svml_scosh_data_internal(%rip); the row order
   below therefore must not change.  */
254 .section .rodata, "a"
/* C-style description of the table layout; only compiled when
   __svml_scosh_data_internal_typedef is defined.  */
257 #ifdef __svml_scosh_data_internal_typedef
258 typedef unsigned int VUINT32;
261 __declspec(align(16)) VUINT32 _sInvLn2[4][1];
262 __declspec(align(16)) VUINT32 _sLn2hi[4][1];
263 __declspec(align(16)) VUINT32 _sLn2lo[4][1];
264 __declspec(align(16)) VUINT32 _sSign[4][1];
265 __declspec(align(16)) VUINT32 _sShifter[4][1];
266 __declspec(align(16)) VUINT32 _iDomainRange[4][1];
267 __declspec(align(16)) VUINT32 _sPC1[4][1];
268 __declspec(align(16)) VUINT32 _sPC2[4][1];
269 __declspec(align(16)) VUINT32 _sPC3[4][1];
270 __declspec(align(16)) VUINT32 _sPC4[4][1];
271 __declspec(align(16)) VUINT32 _sPC5[4][1];
272 __declspec(align(16)) VUINT32 _sPC6[4][1];
273 __declspec(align(16)) VUINT32 _iHalf[4][1];
274 } __svml_scosh_data_internal;
276 __svml_scosh_data_internal:
277 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2: 1/ln(2) ~= 1.442695 */ //k=0
279 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi: high part of ln(2) ~= 0.6931152 */
281 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo: low part of ln(2) ~= 3.19462e-5 (hi + lo ~= 0.6931472) */
283 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign: sign-bit mask */
285 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter: 1.5*2^23 = 12582912.0 */
287 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange: bit pattern of ~87.3365f, loaded with movdqu for the range check */
289 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1 = 1.0 */
291 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 = 0.5 (a2) */
293 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 ~= 0.1666654 (a3, close to 1/6) */
295 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 ~= 0.04166646 (a4, close to 1/24) */
297 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 ~= 0.0083667 (a5, close to 1/120) */
299 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 ~= 0.0013936 (a6, close to 1/720) */
302 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf: bit pattern of 0.5f */
304 .type __svml_scosh_data_internal,@object
305 .size __svml_scosh_data_internal,.-__svml_scosh_data_internal