]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_s_coshf4_core_sse4.S
x86-64: Add vector cosh/coshf implementation to libmvec
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_coshf4_core_sse4.S
1 /* Function coshf vectorized with SSE4.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19 /*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
30 * cosh(0) = 1
31 * cosh(x) overflows for big |x| (beyond MAXLOG+log(2))
32 *
33 */
34
35 /* Byte offsets of the rows of data table __svml_scosh_data_internal.
   Each row is four 32-bit words (one 16-byte vector), so the offsets
   step by 16 and follow the order in which the rows are emitted.
36 */
37 #define _sInvLn2 0 /* multiplier applied to |x| to get x/log(2) */
38 #define _sLn2hi 16 /* high part of log(2) used in the reduction */
39 #define _sLn2lo 32 /* low part of log(2) used in the reduction */
40 #define _sSign 48 /* sign-bit mask, cleared from x to get |x| */
41 #define _sShifter 64 /* constant added then subtracted to round to an integer */
42 #define _iDomainRange 80 /* |x| bit pattern at or above which the scalar path is used */
43 #define _sPC1 96 /* polynomial coefficient a1, not referenced by the code in this file */
44 #define _sPC2 112 /* polynomial coefficient a2 */
45 #define _sPC3 128 /* polynomial coefficient a3 */
46 #define _sPC4 144 /* polynomial coefficient a4 */
47 #define _sPC5 160 /* polynomial coefficient a5 */
48 #define _sPC6 176 /* polynomial coefficient a6 */
49 #define _iHalf 192 /* bit pattern of 0.5f, combined with N using integer add and subtract */
50
51 #include <sysdep.h>
52
53 .text
54 .section .text.sse4,"ax",@progbits
/* _ZGVbN4v_coshf_sse4: cosh for four packed single-precision lanes.
 *
 * In:    xmm0 = x (four floats)
 * Out:   xmm0 = cosh(x) per lane
 * Stack: 72 bytes. On the slow path 0(rsp), 8(rsp) and 16(rsp) hold
 *        r14, r13 and r12, 32(rsp) holds the input vector and
 *        48(rsp) holds the result vector.
 * Uses:  xmm1-xmm13, edx and flags on the fast path. The slow path
 *        also uses eax, r12-r14 (saved and restored here) and
 *        whatever the scalar coshf call clobbers.
 *
 * Every lane is first evaluated by the vector polynomial path.
 * Lanes whose |x| bit pattern is at or above _iDomainRange (this
 * includes Inf and NaN, whose bit patterns are larger) are then
 * recomputed one at a time by calling scalar coshf.
 *
 * NOTE(review): the "LOE" comment lines look like a compiler
 * annotation listing registers that are live at that point, confirm.
 */
55 ENTRY(_ZGVbN4v_coshf_sse4)
56 subq $72, %rsp /* 72 plus the 8-byte return address is 80, so rsp is 16-byte aligned at the coshf call (SysV ABI assumed) */
57 cfi_def_cfa_offset(80)
58
59 /*
60 * Implementation
61 * Abs argument: load the sign mask, the andnps below clears it from x
62 */
63 movups _sSign+__svml_scosh_data_internal(%rip), %xmm1 /* xmm1 = sign-bit mask */
64
65 /*
66 * Load argument
67 * dM = |x|/log(2) + RShifter
68 */
69 movups _sInvLn2+__svml_scosh_data_internal(%rip), %xmm9 /* xmm9 = _sInvLn2 */
70 andnps %xmm0, %xmm1 /* xmm1 = x with the sign bit cleared = |x| */
71 mulps %xmm1, %xmm9 /* xmm9 = |x| * _sInvLn2 */
72
73 /* Check for overflow/underflow: keep a copy of |x| for the range compare */
74 movaps %xmm1, %xmm3 /* xmm3 = |x| */
75 movups _sShifter+__svml_scosh_data_internal(%rip), %xmm4 /* xmm4 = RShifter */
76 movups _sLn2hi+__svml_scosh_data_internal(%rip), %xmm5 /* xmm5 = Log2_hi */
77 addps %xmm4, %xmm9 /* xmm9 = sM = |x|/log(2) + RShifter */
78
79 /*
80 * R
81 * sN = sM - RShifter
82 */
83 movaps %xmm9, %xmm6 /* xmm6 = sM, turned into sN by the subps below */
84
85 /*
86 * G1,G2 2^N,2^(-N)
87 * iM << 23 moves the integer N from the low bits of sM into the exponent field position
88 */
89 pslld $23, %xmm9 /* xmm9 = N << 23, the bits of RShifter itself are shifted out */
90 movups _sLn2lo+__svml_scosh_data_internal(%rip), %xmm7 /* xmm7 = Log2_lo */
91 subps %xmm4, %xmm6 /* xmm6 = sN = sM - RShifter */
92
93 /* sR = sX - sN*Log2_hi */
94 mulps %xmm6, %xmm5 /* xmm5 = sN*Log2_hi */
95
96 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
97 mulps %xmm6, %xmm7 /* xmm7 = sN*Log2_lo */
98 movdqu _iDomainRange+__svml_scosh_data_internal(%rip), %xmm2 /* xmm2 = _iDomainRange */
99 pcmpgtd %xmm2, %xmm3 /* xmm3 = all-ones where |x| bits > _iDomainRange (signed 32-bit compare) */
100 pcmpeqd %xmm1, %xmm2 /* xmm2 = all-ones where |x| bits == _iDomainRange */
101
102 /*
103 * sinh(r) is approximated as r + r*(r^2*(a3+r^2*a5)), with a3 = _sPC3 and a5 = _sPC5
104 * sSinh_r = (a3+r^2*a5)
105 */
106 movups _sPC5+__svml_scosh_data_internal(%rip), %xmm10 /* xmm10 = a5 */
107 por %xmm2, %xmm3 /* xmm3 = lanes with |x| bits >= _iDomainRange */
108
109 /*
110 * cosh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
111 * sOut = (a4 +a6*sR2)
112 */
113 movups _sPC6+__svml_scosh_data_internal(%rip), %xmm11 /* xmm11 = a6 */
114 subps %xmm5, %xmm1 /* xmm1 = |x| - sN*Log2_hi */
115 movmskps %xmm3, %edx /* edx = 4-bit mask, one bit per out-of-range lane */
116 movdqu _iHalf+__svml_scosh_data_internal(%rip), %xmm8 /* xmm8 = bits of 0.5f */
117 subps %xmm7, %xmm1 /* xmm1 = sR = (|x| - sN*Log2_hi) - sN*Log2_lo */
118
119 /* sR2 = sR^2, interleaved with the 2^(N-1) and 2^(-N-1) setup */
120 movaps %xmm1, %xmm13 /* xmm13 = sR */
121 movdqa %xmm8, %xmm2 /* xmm2 = bits of 0.5f */
122 mulps %xmm1, %xmm13 /* xmm13 = sR2 = sR*sR */
123 paddd %xmm9, %xmm2 /* xmm2 = 0.5f with N added to the exponent field = 2^(N-1) */
124 mulps %xmm13, %xmm10 /* xmm10 = a5*sR2 */
125 psubd %xmm9, %xmm8 /* xmm8 = 0.5f with N subtracted from the exponent field = 2^(-N-1) */
126 mulps %xmm13, %xmm11 /* xmm11 = a6*sR2 */
127 addps _sPC3+__svml_scosh_data_internal(%rip), %xmm10 /* xmm10 = a3 + a5*sR2, memory operand relies on the 16-byte aligned table rows */
128 addps _sPC4+__svml_scosh_data_internal(%rip), %xmm11 /* xmm11 = a4 + a6*sR2 */
129
130 /* sSinh_r = r^2*(a3+r^2*a5) */
131 mulps %xmm13, %xmm10
132
133 /* sOut = a2+sR2*(a4+a6*sR2), the a2 term is added two instructions below */
134 mulps %xmm13, %xmm11 /* xmm11 = sR2*(a4+a6*sR2) */
135
136 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
137 mulps %xmm1, %xmm10 /* xmm10 = r*(r^2*(a3+r^2*a5)) */
138 addps _sPC2+__svml_scosh_data_internal(%rip), %xmm11 /* xmm11 = a2 + sR2*(a4+a6*sR2) */
139 addps %xmm10, %xmm1 /* xmm1 = sSinh_r */
140
141 /* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
142 mulps %xmm11, %xmm13
143
144 /* sG1 = 2^(N-1)-2^(-N-1) */
145 movdqa %xmm2, %xmm12 /* xmm12 = 2^(N-1), becomes sG1 after the subps below */
146
147 /* sG2 = 2^(N-1)+2^(-N-1) */
148 addps %xmm8, %xmm2 /* xmm2 = sG2 */
149 subps %xmm8, %xmm12 /* xmm12 = sG1 */
150
151 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
152 mulps %xmm2, %xmm13
153
154 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
155 mulps %xmm1, %xmm12 /* xmm12 = sG1*sSinh_r */
156 addps %xmm12, %xmm13
157
158 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
159 addps %xmm13, %xmm2 /* xmm2 = vector result for all four lanes */
160
161 /* Ret H: test whether the range check flagged any lane */
162 testl %edx, %edx
163
164 /* Go to special inputs processing branch */
165 jne L(SPECIAL_VALUES_BRANCH)
166 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
167
168 /* Restore registers
169 * and exit the function
170 */
171
172 L(EXIT):
173 movaps %xmm2, %xmm0 /* return value = result vector */
174 addq $72, %rsp /* release the scratch frame */
175 cfi_def_cfa_offset(8)
176 ret
177 cfi_def_cfa_offset(80) /* the code below runs with the 72-byte frame still allocated */
178
179 /* Branch to process
180 * special inputs
181 */
182
183 L(SPECIAL_VALUES_BRANCH):
184 movups %xmm0, 32(%rsp) /* spill the original input vector */
185 movups %xmm2, 48(%rsp) /* spill the vector result, flagged lanes are overwritten below */
186 # LOE rbx rbp r12 r13 r14 r15 edx
187
188 xorl %eax, %eax
189 movq %r12, 16(%rsp) /* save callee-saved r12 */
190 cfi_offset(12, -64)
191 movl %eax, %r12d /* r12d = lane index, starts at 0 */
192 movq %r13, 8(%rsp) /* save callee-saved r13 */
193 cfi_offset(13, -72)
194 movl %edx, %r13d /* r13d = range mask, kept across the coshf calls */
195 movq %r14, (%rsp) /* save callee-saved r14 */
196 cfi_offset(14, -80)
197 # LOE rbx rbp r15 r12d r13d
198
199 /* Range mask
200 * bits check
201 */
202
203 L(RANGEMASK_CHECK):
204 btl %r12d, %r13d /* CF = mask bit for lane r12d */
205
206 /* Call scalar math function */
207 jc L(SCALAR_MATH_CALL)
208 # LOE rbx rbp r15 r12d r13d
209
210 /* Special inputs
211 * processing loop
212 */
213
214 L(SPECIAL_VALUES_LOOP):
215 incl %r12d /* next lane */
216 cmpl $4, %r12d /* four lanes in total */
217
218 /* Check bits in range mask */
219 jl L(RANGEMASK_CHECK)
220 # LOE rbx rbp r15 r12d r13d
221
222 movq 16(%rsp), %r12 /* all lanes done, restore the saved registers */
223 cfi_restore(12)
224 movq 8(%rsp), %r13
225 cfi_restore(13)
226 movq (%rsp), %r14
227 cfi_restore(14)
228 movups 48(%rsp), %xmm2 /* reload the patched result vector */
229
230 /* Go to exit */
231 jmp L(EXIT)
232 cfi_offset(12, -64) /* r12-r14 are still saved on the stack in the code that follows */
233 cfi_offset(13, -72)
234 cfi_offset(14, -80)
235 # LOE rbx rbp r12 r13 r14 r15 xmm2
236
237 /* Scalar math function call
238 * to process special input
239 */
240
241 L(SCALAR_MATH_CALL):
242 movl %r12d, %r14d /* r14 = lane index, zero-extended for use in addressing */
243 movss 32(%rsp,%r14,4), %xmm0 /* xmm0 = original input of this lane */
244 call coshf@PLT
245 # LOE rbx rbp r14 r15 r12d r13d xmm0
246
247 movss %xmm0, 48(%rsp,%r14,4) /* replace this lane of the result with the scalar value */
248
249 /* Process special inputs in loop */
250 jmp L(SPECIAL_VALUES_LOOP)
251 # LOE rbx rbp r15 r12d r13d
252 END(_ZGVbN4v_coshf_sse4)
253
254 .section .rodata, "a"
255 .align 16 /* rows are read as 16-byte vectors, some as aligned memory operands */
256 /* The typedef below is a C view of the table layout. It is only seen by the preprocessor if the guard macro is defined, which nothing in this file does. */
257 #ifdef __svml_scosh_data_internal_typedef
258 typedef unsigned int VUINT32;
259 typedef struct
260 {
261 __declspec(align(16)) VUINT32 _sInvLn2[4][1];
262 __declspec(align(16)) VUINT32 _sLn2hi[4][1];
263 __declspec(align(16)) VUINT32 _sLn2lo[4][1];
264 __declspec(align(16)) VUINT32 _sSign[4][1];
265 __declspec(align(16)) VUINT32 _sShifter[4][1];
266 __declspec(align(16)) VUINT32 _iDomainRange[4][1];
267 __declspec(align(16)) VUINT32 _sPC1[4][1];
268 __declspec(align(16)) VUINT32 _sPC2[4][1];
269 __declspec(align(16)) VUINT32 _sPC3[4][1];
270 __declspec(align(16)) VUINT32 _sPC4[4][1];
271 __declspec(align(16)) VUINT32 _sPC5[4][1];
272 __declspec(align(16)) VUINT32 _sPC6[4][1];
273 __declspec(align(16)) VUINT32 _iHalf[4][1];
274 } __svml_scosh_data_internal;
275 #endif
276 __svml_scosh_data_internal:
277 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2: approximately 1/ln(2) = 1.442695 */ //k=0
278 .align 16
279 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi: approximately 0.693115, ln(2) with the low mantissa bits zero */
280 .align 16
281 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo: approximately 3.19e-5, the remainder ln(2) - _sLn2hi */
282 .align 16
283 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign: sign-bit mask */
284 .align 16
285 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter: 1.5*2^23 = 12582912.0 */
286 .align 16
287 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange: bit pattern of approximately 87.34, compared as an integer against |x| */
288 .align 16
289 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
290 .align 16
291 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2: 0.5 */
292 .align 16
293 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3: near 1/6 */
294 .align 16
295 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4: near 1/24 */
296 .align 16
297 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5: near 1/120 */
298 .align 16
299 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6: near 1/720 */
300 // Integer constants
301 .align 16
302 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf: bit pattern of 0.5f, N is added to and subtracted from its exponent field */
303 .align 16
304 .type __svml_scosh_data_internal,@object
305 .size __svml_scosh_data_internal,.-__svml_scosh_data_internal