1 /* Function coshf vectorized with AVX2.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
31 * cosh(x) overflows for large |x| (roughly |x| > MAXLOG+log(2))
35 /* Offsets for data table __svml_scosh_data_internal
42 #define _iDomainRange 160
/* _ZGVdN8v_coshf_avx2: AVX2 kernel that evaluates coshf on the packed
   single-precision lanes of a ymm register (8 lanes).  The vector result
   is assembled in %ymm0.  Lanes whose |x| bit pattern is >= _iDomainRange
   are flagged in %ymm6 and handled lane by lane on the special-values path.
   NOTE(review): this listing does not show the prologue, the copy of the
   argument into %ymm1, the flag-setting instructions in front of the
   conditional jumps, or the scalar call itself -- confirm against the
   full file.  */
54 .section .text.avx2,"ax",@progbits
55 ENTRY(_ZGVdN8v_coshf_avx2)
57 cfi_def_cfa_offset(16)
/* Preload table constants: sign mask -> ymm2, rounding shifter -> ymm7.  */
63 vmovups _sSign+__svml_scosh_data_internal(%rip), %ymm2
64 vmovups _sShifter+__svml_scosh_data_internal(%rip), %ymm7
68 * dM = x/log(2) + RShifter
70 vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %ymm10
71 vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %ymm8
72 vmovups _iDomainRange+__svml_scosh_data_internal(%rip), %ymm3
75 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
76 * sSinh_r = (a3+r^2*a5)
78 vmovups _sPC5+__svml_scosh_data_internal(%rip), %ymm15
79 vmovups _iHalf+__svml_scosh_data_internal(%rip), %ymm11
/* ymm0 = ~ymm2 & ymm1: clears the sign bit of every lane of ymm1
   (ymm2 is the _sSign mask).  Presumably ymm1 holds the argument x,
   so ymm0 = |x| -- verify against the missing prologue.  */
86 vandnps %ymm1, %ymm2, %ymm0
/* ymm10 = |x| * InvLn2 + Shifter; adding the shifter leaves the rounded
   integer N in the low mantissa bits.  */
87 vfmadd213ps %ymm7, %ymm0, %ymm10
/* ymm9 = ymm10 - Shifter = N as a float (sN).  */
93 vsubps %ymm7, %ymm10, %ymm9
97 * iM now is an EXP(2^N)
/* Shift N into the exponent field position (bit 23 and up) -> ymm12.  */
99 vpslld $23, %ymm10, %ymm12
101 /* Check for overflow/underflow */
/* Signed 32-bit integer compares of the |x| bit patterns against
   _iDomainRange: ymm4 = (|x| > range), ymm5 = (|x| == range).  These must
   run before ymm0 is overwritten with the reduced argument below.  */
102 vpcmpgtd %ymm3, %ymm0, %ymm4
103 vpcmpeqd %ymm3, %ymm0, %ymm5
105 /* sR = sX - sN*Log2_hi */
106 vfnmadd231ps %ymm8, %ymm9, %ymm0
/* Integer add/sub of the shifted N to/from the bits of 0.5 (_iHalf):
   ymm13 = 2^(N-1), ymm14 = 2^(-N-1).  */
107 vpaddd %ymm12, %ymm11, %ymm13
108 vpsubd %ymm12, %ymm11, %ymm14
/* ymm6 = per-lane mask of |x| >= _iDomainRange (special lanes).  */
109 vpor %ymm5, %ymm4, %ymm6
111 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
112 vfnmadd231ps _sLn2lo+__svml_scosh_data_internal(%rip), %ymm9, %ymm0
114 /* sG1 = 2^(N-1)-2^(-N-1) */
115 vsubps %ymm14, %ymm13, %ymm4
117 /* sG2 = 2^(N-1)+2^(-N-1) */
118 vaddps %ymm14, %ymm13, %ymm3
120 /* sR2 = sR^2,shuffled */
121 vmulps %ymm0, %ymm0, %ymm2
/* ymm15 = a5*r^2 + a3 (ymm15 was preloaded with _sPC5).  */
122 vfmadd213ps _sPC3+__svml_scosh_data_internal(%rip), %ymm2, %ymm15
124 /* sSinh_r = r^2*(a3+r^2*a5) */
125 vmulps %ymm15, %ymm2, %ymm13
127 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
128 vfmadd213ps %ymm0, %ymm0, %ymm13
131 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
132 * sOut = (a4 +a6*sR2)
/* NOTE(review): the expansion labelled "sinh(X)" just above is the one
   this kernel returns as cosh(x): sG2*cosh(r) + sG1*sinh(r), with
   cosh(r) = 1 + sR2*(a2 + sR2*(a4 + a6*sR2)).  */
134 vmovups _sPC6+__svml_scosh_data_internal(%rip), %ymm0
135 vfmadd213ps _sPC4+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
137 /* sOut = a2+sR2*(a4+a6*sR2) */
138 vfmadd213ps _sPC2+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
140 /* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
141 vmulps %ymm0, %ymm2, %ymm15
143 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
144 vmulps %ymm15, %ymm3, %ymm14
146 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
147 vfmadd213ps %ymm14, %ymm13, %ymm4
/* Pack the sign bit of each lane of the special-lane mask into %edx
   (one bit per lane).  */
148 vmovmskps %ymm6, %edx
150 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
151 vaddps %ymm4, %ymm3, %ymm0
156 /* Go to special inputs processing branch */
/* NOTE(review): the instruction that sets ZF for this jne is not visible
   in this listing; presumably it tests %edx -- confirm.  */
157 jne L(SPECIAL_VALUES_BRANCH)
158 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
161 * and exit the function
177 L(SPECIAL_VALUES_BRANCH):
/* Spill ymm1 to 32(%rsp) and the vector result to 64(%rsp) so that
   single lanes can be read and patched through memory.  */
178 vmovups %ymm1, 32(%rsp)
179 vmovups %ymm0, 64(%rsp)
180 # LOE rbx r12 r13 r14 r15 edx ymm0
183 # LOE rbx r12 r13 r14 r15 eax edx
187 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
188 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
191 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
192 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
195 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
196 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
197 # LOE rbx r15 r12d r13d
206 /* Call scalar math function */
/* NOTE(review): neither the instruction that sets CF for this jc nor the
   L(SCALAR_MATH_CALL) label itself is visible in this listing.  */
207 jc L(SCALAR_MATH_CALL)
208 # LOE rbx r15 r12d r13d
214 L(SPECIAL_VALUES_LOOP):
218 /* Check bits in range mask */
219 jl L(RANGEMASK_CHECK)
220 # LOE rbx r15 r12d r13d
/* Reload the (possibly patched) result vector from its stack slot.  */
228 vmovups 64(%rsp), %ymm0
232 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
233 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
234 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
235 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
236 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
237 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
238 # LOE rbx r12 r13 r14 r15 ymm0
/* Per-lane fix-up: %r14 indexes the 4-byte lane.  The lane is read from
   the spilled ymm1 at 32(%rsp) and, afterwards, %xmm0 is written into the
   same lane of the spilled result at 64(%rsp).  The scalar call that sits
   between the load and the store is not visible in this listing.  */
240 /* Scalar math function call
241 * to process special input
246 movss 32(%rsp,%r14,4), %xmm0
248 # LOE rbx r14 r15 r12d r13d xmm0
250 movss %xmm0, 64(%rsp,%r14,4)
252 /* Process special inputs in loop */
253 jmp L(SPECIAL_VALUES_LOOP)
254 # LOE rbx r15 r12d r13d
255 END(_ZGVdN8v_coshf_avx2)
/* Read-only constant table for the coshf kernel.  Every row below is one
   32-bit word replicated eight times (32 bytes, i.e. one ymm register),
   and the kernel addresses a row as <offset>+__svml_scosh_data_internal.
   Rows appear in the same order as the fields of the layout description
   that follows, so row k starts at byte offset 32*k (for example
   _iDomainRange is row 5, matching its offset of 160).  */
257 .section .rodata, "a"
/* C-style description of the table layout, guarded by an #ifdef.
   NOTE(review): presumably documentation only and never enabled when
   assembling -- the struct opener and the matching #endif are not visible
   in this listing.  */
260 #ifdef __svml_scosh_data_internal_typedef
261 typedef unsigned int VUINT32;
264 __declspec(align(32)) VUINT32 _sInvLn2[8][1];
265 __declspec(align(32)) VUINT32 _sLn2hi[8][1];
266 __declspec(align(32)) VUINT32 _sLn2lo[8][1];
267 __declspec(align(32)) VUINT32 _sSign[8][1];
268 __declspec(align(32)) VUINT32 _sShifter[8][1];
269 __declspec(align(32)) VUINT32 _iDomainRange[8][1];
270 __declspec(align(32)) VUINT32 _sPC1[8][1];
271 __declspec(align(32)) VUINT32 _sPC2[8][1];
272 __declspec(align(32)) VUINT32 _sPC3[8][1];
273 __declspec(align(32)) VUINT32 _sPC4[8][1];
274 __declspec(align(32)) VUINT32 _sPC5[8][1];
275 __declspec(align(32)) VUINT32 _sPC6[8][1];
276 __declspec(align(32)) VUINT32 _iHalf[8][1];
277 } __svml_scosh_data_internal;
/* Row values decoded as IEEE-754 single precision:
     _sInvLn2       0x3FB8AA3B  ~ 1.442695 (1/ln 2)
     _sLn2hi        0x3F317000  ~ 0.693115 (high part of ln 2)
     _sLn2lo        0x3805fdf4  ~ 3.19e-5  (low part; hi + lo ~ ln 2)
     _sSign         0x80000000  sign-bit mask
     _sShifter      0x4b400000  = 1.5 * 2^23
     _iDomainRange  0x42AEAC4E  ~ 87.34, compared as an integer bit pattern
     _sPC1          0x3F800000  = 1.0
     _sPC2          0x3f000000  = 0.5
     _sPC3          0x3e2aaa57  ~ 0.166665 (close to 1/6)
     _sPC4          0x3d2aaa72  ~ 0.0416665 (close to 1/24)
     _sPC5          0x3c091461  ~ 0.00837
     _sPC6          0x3ab6a8a3  ~ 0.00139
     _iHalf         0x3f000000  = 0.5, used with integer add/sub to form
                                 2^(N-1) and 2^(-N-1)  */
279 __svml_scosh_data_internal:
280 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ //k=0
282 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
284 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
286 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
288 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
290 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
292 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
294 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
296 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
298 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
300 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
302 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
305 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf*/
/* Mark the symbol as a data object and record its size in bytes.  */
307 .type __svml_scosh_data_internal,@object
308 .size __svml_scosh_data_internal,.-__svml_scosh_data_internal