]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
x86-64: Add vector cosh/coshf implementation to libmvec
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_coshf8_core_avx2.S
1 /* Function coshf vectorized with AVX2.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19 /*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
30 * cosh(0) = 1
31 * cosh(x) overflows for big |x|, i.e. once |x| exceeds MAXLOG+log(2)
32 *
33 */
34
35 /* Byte offsets of the fields in the data table __svml_scosh_data_internal.
 * Every field is one row of eight 32-bit words (32 bytes, one word per
 * vector lane), which is why consecutive offsets differ by 32.
 * _sPC1 is listed to match the table layout, the code in this file never
 * loads it.
36 */
37 #define _sInvLn2 0
38 #define _sLn2hi 32
39 #define _sLn2lo 64
40 #define _sSign 96
41 #define _sShifter 128
42 #define _iDomainRange 160
43 #define _sPC1 192
44 #define _sPC2 224
45 #define _sPC3 256
46 #define _sPC4 288
47 #define _sPC5 320
48 #define _sPC6 352
49 #define _iHalf 384
50
51 #include <sysdep.h>
52
53 .text
54 .section .text.avx2,"ax",@progbits
/*
 * _ZGVdN8v_coshf_avx2
 *
 * In:  ymm0 = eight packed single-precision inputs x.
 * Out: ymm0 = eight packed results cosh(x).
 *
 * Fast path: all eight lanes are computed at once from |x| using range
 * reduction by ln2 and two short polynomials in the reduced argument.
 * Slow path: a lane whose |x| bit pattern, compared as a signed 32-bit
 * integer, is >= _iDomainRange is recomputed by calling the scalar coshf
 * on the original input of that lane.
 *
 * Stack frame after alignment (96 bytes, 32-byte aligned):
 *    0(rsp) .. 23(rsp)   saved r14, r13, r12 (slow path only)
 *   32(rsp) .. 63(rsp)   copy of the input vector (slow path only)
 *   64(rsp) .. 95(rsp)   result vector, patched lane by lane (slow path only)
 * rbp holds the caller stack pointer and is used to undo the alignment.
 */
55 ENTRY(_ZGVdN8v_coshf_avx2)
56 pushq %rbp
57 cfi_def_cfa_offset(16)
58 movq %rsp, %rbp
59 cfi_def_cfa(6, 16)
60 cfi_offset(6, -16)
61 andq $-32, %rsp /* round rsp down to a multiple of 32 */
62 subq $96, %rsp /* reserve the 96-byte scratch frame */
63 vmovups _sSign+__svml_scosh_data_internal(%rip), %ymm2 /* ymm2 = sign-bit mask */
64 vmovups _sShifter+__svml_scosh_data_internal(%rip), %ymm7 /* ymm7 = RShifter */
65
66 /*
67 * Load the constants for the range reduction
68 * sM = |x|/log(2) + RShifter (computed further down into ymm10)
69 */
70 vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %ymm10
71 vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %ymm8
72 vmovups _iDomainRange+__svml_scosh_data_internal(%rip), %ymm3
73
74 /*
75 * sinh(r) is approximated as r + r*(r^2*(a3 + r^2*a5))
76 * with a3 = _sPC3 and a5 = _sPC5. ymm15 starts out as a5.
77 */
78 vmovups _sPC5+__svml_scosh_data_internal(%rip), %ymm15
79 vmovups _iHalf+__svml_scosh_data_internal(%rip), %ymm11
80 vmovaps %ymm0, %ymm1 /* ymm1 keeps the original input for the slow path */
81
82 /*
83 * Implementation
84 * Abs argument: ymm0 = x with the sign bit cleared
85 */
86 vandnps %ymm1, %ymm2, %ymm0
87 vfmadd213ps %ymm7, %ymm0, %ymm10 /* ymm10 = sM = |x|*InvLn2 + RShifter */
88
89 /*
90 * R
91 * sN = sM - RShifter (ymm9)
92 */
93 vsubps %ymm7, %ymm10, %ymm9
94
95 /*
96 * G1,G2 2^N,2^(-N)
97 * The bits of sM are shifted left by 23 so that its low bits land in
98 * the float exponent field (ymm12). It is added to and subtracted from
 * _iHalf below to build 2^(N-1) and 2^(-N-1).
 */
99 vpslld $23, %ymm10, %ymm12
100
101 /* Check for overflow/underflow: integer compare of the |x| bits against _iDomainRange */
102 vpcmpgtd %ymm3, %ymm0, %ymm4
103 vpcmpeqd %ymm3, %ymm0, %ymm5
104
105 /* sR = sX - sN*Log2_hi */
106 vfnmadd231ps %ymm8, %ymm9, %ymm0
107 vpaddd %ymm12, %ymm11, %ymm13 /* ymm13 = _iHalf + (sM bits << 23) = 2^(N-1) */
108 vpsubd %ymm12, %ymm11, %ymm14 /* ymm14 = _iHalf - (sM bits << 23) = 2^(-N-1) */
109 vpor %ymm5, %ymm4, %ymm6 /* ymm6 = lanes with |x| bits >= _iDomainRange */
110
111 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
112 vfnmadd231ps _sLn2lo+__svml_scosh_data_internal(%rip), %ymm9, %ymm0
113
114 /* sG1 = 2^(N-1)-2^(-N-1) */
115 vsubps %ymm14, %ymm13, %ymm4
116
117 /* sG2 = 2^(N-1)+2^(-N-1) */
118 vaddps %ymm14, %ymm13, %ymm3
119
120 /* sR2 = sR^2, then ymm15 = a3 + sR2*a5 */
121 vmulps %ymm0, %ymm0, %ymm2
122 vfmadd213ps _sPC3+__svml_scosh_data_internal(%rip), %ymm2, %ymm15
123
124 /* sSinh_r = r^2*(a3+r^2*a5) */
125 vmulps %ymm15, %ymm2, %ymm13
126
127 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
128 vfmadd213ps %ymm0, %ymm0, %ymm13
129
130 /*
131 * cosh(X) = sG2 + sG1*sinh(sR) + sG2*sR2*(a2+sR2*(a4+a6*sR2))
132 * sOut = (a4 + a6*sR2), ymm0 is reused as the accumulator from here on
133 */
134 vmovups _sPC6+__svml_scosh_data_internal(%rip), %ymm0
135 vfmadd213ps _sPC4+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
136
137 /* sOut = a2+sR2*(a4+a6*sR2) */
138 vfmadd213ps _sPC2+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
139
140 /* sOut = sR2*(a2+sR2*(a4+a6*sR2)) */
141 vmulps %ymm0, %ymm2, %ymm15
142
143 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
144 vmulps %ymm15, %ymm3, %ymm14
145
146 /* sOut = sG1*sinh(sR)+sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
147 vfmadd213ps %ymm14, %ymm13, %ymm4
148 vmovmskps %ymm6, %edx /* edx = one bit per lane, set for lanes flagged in ymm6 */
149
150 /* sOut = sG2 + sG1*sinh(sR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
151 vaddps %ymm4, %ymm3, %ymm0
152
153 /* Is any lane flagged as special? */
154 testl %edx, %edx
155
156 /* Go to special inputs processing branch */
157 jne L(SPECIAL_VALUES_BRANCH)
158 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
159
160 /* Restore registers
161 * and exit the function. rsp is restored from rbp, which undoes both
162 * the alignment and the 96-byte reservation. ymm0 holds the result.
 */
163
164 L(EXIT):
165 movq %rbp, %rsp
166 popq %rbp
167 cfi_def_cfa(7, 8)
168 cfi_restore(6)
169 ret
/* The two directives below repeat the rbp-based CFA description from the
   prologue for the code that follows the ret, which is only reached by
   jumps taken while the frame is still set up. */
170 cfi_def_cfa(6, 16)
171 cfi_offset(6, -16)
172
173 /* Branch to process
174 * special inputs: spill the original input to 32(rsp) and the fast-path
175 * result to 64(rsp) so single lanes can be read and overwritten
 */
176
177 L(SPECIAL_VALUES_BRANCH):
178 vmovups %ymm1, 32(%rsp)
179 vmovups %ymm0, 64(%rsp)
180 # LOE rbx r12 r13 r14 r15 edx ymm0
181
182 xorl %eax, %eax
183 # LOE rbx r12 r13 r14 r15 eax edx
184
/* Clear the upper ymm halves before the scalar SSE code below, then save
   r12, r13 and r14 in the frame and reuse them as
   r12d = lane index (starts at 0), r13d = lane mask, r14 = index scratch. */
185 vzeroupper
186 movq %r12, 16(%rsp)
187 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
188 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
189 movl %eax, %r12d
190 movq %r13, 8(%rsp)
191 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
192 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
193 movl %edx, %r13d
194 movq %r14, (%rsp)
195 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
196 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
197 # LOE rbx r15 r12d r13d
198
199 /* Range mask
200 * bits check: carry flag = bit number r12d of the lane mask in r13d
201 */
202
203 L(RANGEMASK_CHECK):
204 btl %r12d, %r13d
205
206 /* Call scalar math function if this lane was flagged */
207 jc L(SCALAR_MATH_CALL)
208 # LOE rbx r15 r12d r13d
209
210 /* Special inputs
211 * processing loop: advance to the next lane, stop after lane 7
212 */
213
214 L(SPECIAL_VALUES_LOOP):
215 incl %r12d
216 cmpl $8, %r12d
217
218 /* Check bits in range mask */
219 jl L(RANGEMASK_CHECK)
220 # LOE rbx r15 r12d r13d
221
/* All eight lanes visited: restore the saved registers and reload the
   patched result vector into ymm0. */
222 movq 16(%rsp), %r12
223 cfi_restore(12)
224 movq 8(%rsp), %r13
225 cfi_restore(13)
226 movq (%rsp), %r14
227 cfi_restore(14)
228 vmovups 64(%rsp), %ymm0
229
230 /* Go to exit */
231 jmp L(EXIT)
232 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
233 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
234 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
235 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
236 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
237 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
238 # LOE rbx r12 r13 r14 r15 ymm0
239
240 /* Scalar math function call
241 * to process special input: load lane r12d of the saved input, call
242 * coshf on it and store the return value into the same lane of the result
 */
243
244 L(SCALAR_MATH_CALL):
245 movl %r12d, %r14d
246 movss 32(%rsp,%r14,4), %xmm0
247 call coshf@PLT
248 # LOE rbx r14 r15 r12d r13d xmm0
249
250 movss %xmm0, 64(%rsp,%r14,4)
251
252 /* Process special inputs in loop */
253 jmp L(SPECIAL_VALUES_LOOP)
254 # LOE rbx r15 r12d r13d
255 END(_ZGVdN8v_coshf_avx2)
256
257 .section .rodata, "a"
258 .align 32
259
/* Constant table for the AVX2 coshf kernel. Each row is one value repeated
   eight times (one copy per vector lane) and starts on a 32-byte boundary,
   matching the _sXXX and _iXXX byte offsets defined near the top of the file.
   The typedef below is a C-style description of the same layout. It is
   guarded by a macro that nothing in the visible part of this file defines. */
260 #ifdef __svml_scosh_data_internal_typedef
261 typedef unsigned int VUINT32;
262 typedef struct
263 {
264 __declspec(align(32)) VUINT32 _sInvLn2[8][1];
265 __declspec(align(32)) VUINT32 _sLn2hi[8][1];
266 __declspec(align(32)) VUINT32 _sLn2lo[8][1];
267 __declspec(align(32)) VUINT32 _sSign[8][1];
268 __declspec(align(32)) VUINT32 _sShifter[8][1];
269 __declspec(align(32)) VUINT32 _iDomainRange[8][1];
270 __declspec(align(32)) VUINT32 _sPC1[8][1];
271 __declspec(align(32)) VUINT32 _sPC2[8][1];
272 __declspec(align(32)) VUINT32 _sPC3[8][1];
273 __declspec(align(32)) VUINT32 _sPC4[8][1];
274 __declspec(align(32)) VUINT32 _sPC5[8][1];
275 __declspec(align(32)) VUINT32 _sPC6[8][1];
276 __declspec(align(32)) VUINT32 _iHalf[8][1];
277 } __svml_scosh_data_internal;
278 #endif
279 __svml_scosh_data_internal:
280 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2: 1/ln(2), about 1.442695 */ //k=0
281 .align 32
282 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi: high part of ln(2), about 0.693115 */
283 .align 32
284 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo: low part of ln(2), about 3.19e-5 */
285 .align 32
286 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign: sign-bit mask, cleared from x to form |x| */
287 .align 32
288 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter: 1.5*2^23, added and then subtracted to round |x|/ln(2) to an integer */
289 .align 32
290 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange: float bits of about 87.34, lanes with |x| bits >= this use the scalar fallback */
291 .align 32
292 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 (1.0, not loaded by the code in this file) */
293 .align 32
294 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2: a2 = 0.5 */
295 .align 32
296 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3: a3, close to 1/6 */
297 .align 32
298 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4: a4, close to 1/24 */
299 .align 32
300 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5: a5, near 1/120 */
301 .align 32
302 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6: a6, near 1/720 */
303 // Integer constants
304 .align 32
305 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf: bit pattern of 0.5f, combined with integer add and subtract to build 2^(N-1) and 2^(-N-1) */
306 .align 32
307 .type __svml_scosh_data_internal,@object
308 .size __svml_scosh_data_internal,.-__svml_scosh_data_internal
308 .size __svml_scosh_data_internal,.-__svml_scosh_data_internal