1 /* Function coshf vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(+/-INF) = +INF
31 * cosh(x) overflows for big x, i.e. once |x| exceeds MAXLOG+log(2)
35 /* Offsets for data table __svml_scosh_data_internal
/* Each value is a byte displacement that the kernel adds to the
   __svml_scosh_data_internal label.  The two exponent tables hold 32
   dwords (128 bytes) each and every other field is one 16-dword
   (64-byte) vector, which is why the steps are 128, 128, then 64.  */
37 #define _sExp_tbl_PH 0
38 #define _sExp_tbl_NH 128
39 #define _sShifter_UISA 256
40 #define _iDomainRange_UISA 320
41 #define _sPC1_UISA 384
42 #define _sPC2_UISA 448
43 #define _sPC3_UISA 512
/* NOTE(review): the offsets between 512 and 960 are not visible in this
   excerpt even though the kernel references names such as _sInvLn2,
   _sLn2hi, _sLn2lo, _sSign and _iExpMask -- confirm they are defined.  */
50 #define _iDomainRange 960
/* _ZGVeN16v_coshf_skx -- AVX-512 coshf kernel operating on 16 packed floats.

   In:   zmm0 = argument vector x.
   Out:  the fast path leaves its result in zmm6; the instructions that
         hand it back to the caller are not visible in this excerpt.

   Fast path, as far as the visible instructions show:
     1. |x| is formed, multiplied by _sInvLn2 and _sShifter_UISA is added.
     2. The low bits of that sum index the _sExp_tbl_PH and _sExp_tbl_NH
        tables, and the sum shifted left by 18 supplies exponent bits.
     3. A reduced argument r is built with the _sLn2hi and _sLn2lo pair.
     4. res = sG1*(r + a3*r^3) + sG2*(1 + a2*r^2).
   Floating-point steps use the {rn-sae} form (round to nearest with
   exceptions suppressed).
   Lanes whose |x| bit pattern is not below _iDomainRange_UISA are flagged
   for L(SPECIAL_VALUES_BRANCH), which spills input and result to the
   stack so that single lanes can be reprocessed.

   NOTE(review): the prologue, epilogue, mask-to-GPR transfer and the
   L(SCALAR_MATH_CALL) and L(RANGEMASK_CHECK) labels are referenced but not
   shown here; confirm against the complete file before relying on the
   stack layout.  */
57 .section .text.evex512, "ax", @progbits
58 ENTRY(_ZGVeN16v_coshf_skx)
60 cfi_def_cfa_offset(16)
/* Preload constants: zmm4 = _sSign, zmm6 = _sShifter_UISA,
   zmm10 = _sInvLn2, zmm7 and zmm9 = _sLn2hi and _sLn2lo,
   zmm2 = _sPC3_UISA, zmm3 = _sPC2_UISA.  */
66 vmovups _sSign+__svml_scosh_data_internal(%rip), %zmm4
67 vmovups _sShifter_UISA+__svml_scosh_data_internal(%rip), %zmm6
71 * dM = x/log(2) + RShifter
73 vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %zmm10
74 vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %zmm7
75 vmovups _sLn2lo+__svml_scosh_data_internal(%rip), %zmm9
78 vmovups _sPC3_UISA+__svml_scosh_data_internal(%rip), %zmm2
81 vmovups _sPC2_UISA+__svml_scosh_data_internal(%rip), %zmm3
83 /* G1, G2 2^N, 2^(-N) */
/* zmm12 and zmm13 start out holding the first 16 entries of
   _sExp_tbl_PH (table offset 0) and of _sExp_tbl_NH.  */
84 vmovups __svml_scosh_data_internal(%rip), %zmm12
85 vmovups _sExp_tbl_NH+__svml_scosh_data_internal(%rip), %zmm13
/* zmm1 = |x|: and-not with the _sSign mask in zmm4 clears the sign bit.  */
91 vandnps %zmm0, %zmm4, %zmm1
93 /* Check for overflow/underflow */
/* zmm5 = all ones in every lane (ternary-logic truth table 0xff); the
   in-range lanes are cleared further down.  */
94 vpternlogd $255, %zmm5, %zmm5, %zmm5
/* zmm10 = |x| * _sInvLn2 + _sShifter_UISA.  */
95 vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm10
/* k1 = lanes where the bit pattern of |x|, compared as a signed dword
   (predicate 1 = less-than), is below _iDomainRange_UISA.  */
96 vpcmpd $1, _iDomainRange_UISA+__svml_scosh_data_internal(%rip), %zmm1, %k1
98 /* iM now is an EXP(2^N) */
99 vpslld $18, %zmm10, %zmm11
/* zmm8 = zmm10 - _sShifter_UISA, i.e. the sum with the shifter removed.  */
105 vsubps {rn-sae}, %zmm6, %zmm10, %zmm8
/* Finish the 32-entry lookups: the index lanes come from zmm10, the
   destination register supplies entries 0..15 and the memory operand at
   table+64 supplies entries 16..31.  */
106 vpermt2ps _sExp_tbl_PH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm12
107 vpermt2ps _sExp_tbl_NH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm13
/* and-not of zmm1 with itself is zero, so this clears zmm5 in the k1
   (in-range) lanes and leaves the remaining lanes all ones.  */
108 vpandnd %zmm1, %zmm1, %zmm5{%k1}
110 /* sR = sX - sN*Log2_hi */
111 vfnmadd231ps {rn-sae}, %zmm7, %zmm8, %zmm1
/* k0 = lanes where zmm5 is still nonzero, i.e. lanes that failed the
   range check above.  */
112 vptestmd %zmm5, %zmm5, %k0
114 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
115 vfnmadd231ps {rn-sae}, %zmm9, %zmm8, %zmm1
/* zmm4 = r^2, then zmm2 = _sPC3_UISA * r^2.  */
117 vmulps {rn-sae}, %zmm1, %zmm1, %zmm4
118 vmulps {rn-sae}, %zmm4, %zmm2, %zmm2
120 /* sSinh_r = r + r*(r^2*(a3)) */
121 vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm2
123 /* sOut = r^2*(a2) */
124 vmulps {rn-sae}, %zmm4, %zmm3, %zmm1
/* zmm14 = bits of zmm11 selected by _iExpMask; they are added to the PH
   lookup (giving zmm15) and subtracted from the NH lookup (giving zmm10)
   with integer arithmetic.  */
125 vpandd _iExpMask+__svml_scosh_data_internal(%rip), %zmm11, %zmm14
126 vpaddd %zmm14, %zmm12, %zmm15
127 vpsubd %zmm14, %zmm13, %zmm10
129 /* sG2 = 2^N*Th + 2^(-N)*T_h */
130 vaddps {rn-sae}, %zmm10, %zmm15, %zmm5
132 /* sG1 = 2^N*Th - 2^(-N)*T_h */
133 vsubps {rn-sae}, %zmm10, %zmm15, %zmm6
135 /* res = sG1*(r + r*(r^2*(a3))) + sG2*(1+r^2*(a2)) */
136 vfmadd213ps {rn-sae}, %zmm5, %zmm5, %zmm1
137 vfmadd213ps {rn-sae}, %zmm1, %zmm2, %zmm6
140 /* Go to special inputs processing branch */
/* NOTE(review): the instruction that sets the flags for this jne is not
   visible in this excerpt; presumably it tests the k0 mask.  */
141 jne L(SPECIAL_VALUES_BRANCH)
142 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
145 * and exit the function
162 L(SPECIAL_VALUES_BRANCH):
/* Spill the input vector to 64(%rsp) and the fast-path result to
   128(%rsp).  */
163 vmovups %zmm0, 64(%rsp)
164 vmovups %zmm6, 128(%rsp)
165 # LOE rbx r12 r13 r14 r15 edx zmm6
168 # LOE rbx r12 r13 r14 r15 eax edx
172 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
173 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
176 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
177 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
180 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
181 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
182 # LOE rbx r15 r12d r13d
191 /* Call scalar math function */
192 jc L(SCALAR_MATH_CALL)
193 # LOE rbx r15 r12d r13d
199 L(SPECIAL_VALUES_LOOP):
203 /* Check bits in range mask */
204 jl L(RANGEMASK_CHECK)
205 # LOE rbx r15 r12d r13d
/* Reload the result vector from its 128(%rsp) spill slot.  */
213 vmovups 128(%rsp), %zmm6
217 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
218 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
219 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
220 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
221 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
222 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
223 # LOE rbx r12 r13 r14 r15 zmm6
225 /* Scalar math function call
226 * to process special input
231 vmovss 64(%rsp, %r14, 4), %xmm0
233 # LOE rbx r14 r15 r12d r13d xmm0
235 vmovss %xmm0, 128(%rsp, %r14, 4)
237 /* Process special inputs in loop */
238 jmp L(SPECIAL_VALUES_LOOP)
239 # LOE rbx r15 r12d r13d
240 END(_ZGVeN16v_coshf_skx)
/* Read-only constant table for the coshf kernel.  The rows appear in the
   same order as the byte offsets defined near the top of the file: two
   32-dword exponent tables followed by 16-dword broadcast vectors.  */
242 .section .rodata, "a"
/* C-style description of the table layout, guarded by the
   __svml_scosh_data_internal_typedef macro.  */
245 #ifdef __svml_scosh_data_internal_typedef
246 typedef unsigned int VUINT32;
248 __declspec(align(64)) VUINT32 _sExp_tbl_PH[32][1];
249 __declspec(align(64)) VUINT32 _sExp_tbl_NH[32][1];
250 __declspec(align(64)) VUINT32 _sShifter_UISA[16][1];
251 __declspec(align(64)) VUINT32 _iDomainRange_UISA[16][1];
252 __declspec(align(64)) VUINT32 _sPC1_UISA[16][1];
253 __declspec(align(64)) VUINT32 _sPC2_UISA[16][1];
254 __declspec(align(64)) VUINT32 _sPC3_UISA[16][1];
255 __declspec(align(64)) VUINT32 _sInvLn2[16][1];
256 __declspec(align(64)) VUINT32 _sLn2hi[16][1];
257 __declspec(align(64)) VUINT32 _sLn2lo[16][1];
258 __declspec(align(64)) VUINT32 _sSign[16][1];
259 __declspec(align(64)) VUINT32 _iExpMask[16][1];
260 __declspec(align(64)) VUINT32 _sShifter[16][1];
261 __declspec(align(64)) VUINT32 _iDomainRange[16][1];
262 __declspec(align(64)) VUINT32 _sPC1[16][1];
263 __declspec(align(64)) VUINT32 _sPC2[16][1];
264 __declspec(align(64)) VUINT32 _sPC3[16][1];
265 } __svml_scosh_data_internal;
267 __svml_scosh_data_internal:
268 /* _sExp_tbl_PH 2^(i/32-1), i=0..31 */
269 .long 0x3f000000, 0x3f02cd87, 0x3f05aac3, 0x3f08980f
270 .long 0x3f0b95c2, 0x3f0ea43a, 0x3f11c3d3, 0x3f14f4f0
271 .long 0x3f1837f0, 0x3f1b8d3a, 0x3f1ef532, 0x3f227043
272 .long 0x3f25fed7, 0x3f29a15b, 0x3f2d583f, 0x3f3123f6
273 .long 0x3f3504f3, 0x3f38fbaf, 0x3f3d08a4, 0x3f412c4d
274 .long 0x3f45672a, 0x3f49b9be, 0x3f4e248c, 0x3f52a81e
275 .long 0x3f5744fd, 0x3f5bfbb8, 0x3f60ccdf, 0x3f65b907
276 .long 0x3f6ac0c7, 0x3f6fe4ba, 0x3f75257d, 0x3f7a83b3
277 /* _sExp_tbl_NH 2^(-i/32-1), i=0..31 */
279 .long 0x3f000000, 0x3efa83b3, 0x3ef5257d, 0x3eefe4ba
280 .long 0x3eeac0c7, 0x3ee5b907, 0x3ee0ccdf, 0x3edbfbb8
281 .long 0x3ed744fd, 0x3ed2a81e, 0x3ece248c, 0x3ec9b9be
282 .long 0x3ec5672a, 0x3ec12c4d, 0x3ebd08a4, 0x3eb8fbaf
283 .long 0x3eb504f3, 0x3eb123f6, 0x3ead583f, 0x3ea9a15b
284 .long 0x3ea5fed7, 0x3ea27043, 0x3e9ef532, 0x3e9b8d3a
285 .long 0x3e9837f0, 0x3e94f4f0, 0x3e91c3d3, 0x3e8ea43a
286 .long 0x3e8b95c2, 0x3e88980f, 0x3e85aac3, 0x3e82cd87
/* Broadcast constants: every row below repeats one dword 16 times so
   that a single 64-byte load fills all lanes of a zmm register.  */
288 .long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000 /* 1.5*2^18 _sShifter_UISA */
/* Range-check threshold; 0x42AEAC4E is approximately 87.34 when read as
   a float.  The kernel compares it against |x| as an integer.  */
290 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange_UISA */
/* NOTE(review): _sPC1_UISA is not loaded by any kernel instruction
   visible in this excerpt.  */
292 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1_UISA=1 */
294 .long 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f /* _sPC2_UISA */
296 .long 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd /* _sPC3_UISA */
298 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
300 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
302 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
/* Sign-bit mask of a single-precision value; the kernel and-nots it
   with the input to form |x|.  */
304 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
/* Mask covering the eight exponent bits of a single-precision value.  */
306 .long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _iExpMask */
/* NOTE(review): the remaining rows (_sShifter, _iDomainRange, _sPC1,
   _sPC2, _sPC3) are not referenced by the kernel instructions visible in
   this excerpt.  Read as floats, 0x4b400000 is 1.5*2^23 and 0x3f000000
   is 0.5.  */
308 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
310 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
312 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
314 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
316 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
318 .type __svml_scosh_data_internal, @object
319 .size __svml_scosh_data_internal, .-__svml_scosh_data_internal