]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/fpu/multiarch/svml_d_cosh2_core_sse4.S
x86-64: Add vector cosh/coshf implementation to libmvec
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_d_cosh2_core_sse4.S
1 /* Function cosh vectorized with SSE4.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19 /*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(+/-INF) = +INF
30 * cosh(0) = 1
31 * cosh(x) overflows for big x, i.e. |x| > MAXLOG+log(2)
32 *
33 */
34
35 /* Byte offsets of the fields of data table __svml_dcosh_data_internal.
 * _dbT holds 257 8-byte entries (2056 bytes); the .align 16 that follows it
 * in the table pads it to 2064.  Every later field occupies one 16-byte slot,
 * so the offsets advance by 16 from _dbInvLn2 onwards.
36 */
37 #define _dbT 0
38 #define _dbInvLn2 2064
39 #define _dbLn2hi 2080
40 #define _dbLn2lo 2096
41 #define _dbShifter 2112
42 #define _iIndexMask 2128
43 #define _dPC2 2144
44 #define _dPC3 2160
45 #define _dPC4 2176
46 #define _iMaxIndex 2192
47 #define _lExpMask 2208
48 #define _dSign 2224
49 #define _iDomainRange 2240
50
51 #include <sysdep.h>
52
53 .text
54 .section .text.sse4,"ax",@progbits
/*
 * _ZGVbN2v_cosh_sse4: cosh of two packed doubles.
 *
 * In:   xmm0 = the two arguments (copied to xmm4 on entry).
 * Out:  xmm0 = the two results.
 *
 * Fast path: both lanes are computed in line from the _dbT table and the
 * _dPC2.._dPC4 polynomial coefficients.
 * Special path: eax holds a 2-bit mask of lanes whose |x| high dword is
 * greater (signed dword compare) than _iDomainRange; each such lane is
 * recomputed with a call to scalar cosh@PLT.
 *
 * Stack frame (72 bytes, allocated on entry):
 *    0(%rsp)  saved r14   \
 *    8(%rsp)  saved r13    > written only on the special path
 *   16(%rsp)  saved r12   /
 *   32(%rsp)  copy of the input vector (special path)
 *   48(%rsp)  result vector, patched lane by lane (special path)
 */
55 ENTRY(_ZGVbN2v_cosh_sse4)
56 subq $72, %rsp
57 cfi_def_cfa_offset(80)
58 movaps %xmm0, %xmm4
59 movups _dSign+__svml_dcosh_data_internal(%rip), %xmm2
60 lea _dbT+__svml_dcosh_data_internal(%rip), %r8
61
62 /* Abs argument: xmm5 = sign mask here; the andnps below turns it into |x| */
63 movaps %xmm2, %xmm5
64
65 /* dXSign: xmm2 = _dSign >> 11 = 0x0010000000000000 (one unit of the exponent field) */
66 psrlq $11, %xmm2
67
68 /*
69 * Load argument
70 * dM = x*2^K/log(2) + RShifter
71 */
72 movups _dbInvLn2+__svml_dcosh_data_internal(%rip), %xmm3
73 andnps %xmm4, %xmm5
74 mulpd %xmm5, %xmm3
75 movups _dbShifter+__svml_dcosh_data_internal(%rip), %xmm1
76 addpd %xmm1, %xmm3
77
78 /*
79 * R
80 * dN = dM - RShifter
81 */
82 movaps %xmm3, %xmm15
83 subpd %xmm1, %xmm15
84
85 /* dR = dX - dN*Log2_hi/2^K */
86 movups _dbLn2hi+__svml_dcosh_data_internal(%rip), %xmm14
87 mulpd %xmm15, %xmm14
88
89 /* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
90 movups _dbLn2lo+__svml_dcosh_data_internal(%rip), %xmm1
91 mulpd %xmm15, %xmm1
92
93 /*
94 * Check for overflow\underflow
95 * xmm7 = high dword of each |x| (pshufd $221 selects dwords 1,3,1,3);
 * it is compared against _iDomainRange further down.
96 */
97 pshufd $221, %xmm5, %xmm7
98 subpd %xmm14, %xmm5
99 movq _iIndexMask+__svml_dcosh_data_internal(%rip), %xmm8
100
101 /* Index and lookup: xmm9 = low dword of each dM (pshufd $136 selects dwords 0,2,0,2) */
102 pshufd $136, %xmm3, %xmm9
103
104 /*
105 * G1,G2,G3: dTdif,dTn * 2^N,2^(-N)
106 * NB: copied from sinh_la - to be optimized!!!!!
107 */
108 psllq $44, %xmm3
109
110 /*
111 * trick
112 * xmm12 = 256 - iIndex (iIndex = low dword of dM masked with _iIndexMask)
113 */
114 movq _iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm12
115 pand %xmm8, %xmm9
116 subpd %xmm1, %xmm5
117 psubd %xmm9, %xmm12
118
119 /* iIndex *= 8 (pslld $3 on xmm10 below): byte offset of entry iIndex in _dbT */
120 movdqa %xmm9, %xmm10
121
122 /* (256 - iIndex) *= 8 (pslld $3 on xmm12): byte offset of the mirrored _dbT entry */
123 pslld $3, %xmm12
124 pslld $3, %xmm10
125 movd %xmm12, %esi
126 pshufd $1, %xmm12, %xmm13
127 movq _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm6
128 movd %xmm13, %edi
129 pcmpgtd %xmm6, %xmm7
130 movmskps %xmm7, %eax
131
132 /* dR2 = dR^2 */
133 movaps %xmm5, %xmm7
134
135 /* lM now is an EXP(2^N) */
136 pand _lExpMask+__svml_dcosh_data_internal(%rip), %xmm3
137 pshufd $1, %xmm10, %xmm11
138 movslq %esi, %rsi
139 mulpd %xmm5, %xmm7
140 movd %xmm10, %edx
141 movsd (%r8,%rsi), %xmm6
142 movd %xmm11, %ecx
143 movslq %edi, %rdi
144 movslq %edx, %rdx
145 movslq %ecx, %rcx
146 movhpd (%r8,%rdi), %xmm6
147
148 /* xmm6 = _dbT[256 - iIndex] with lM subtracted as a 64-bit integer (scales by 2^-N) */
149 psubq %xmm3, %xmm6
150
151 /* lX- = EXP(1/2): subtract xmm2 (_dSign >> 11) as a 64-bit integer */
152 psubq %xmm2, %xmm6
153
154 /*
155 * sinh(r) = r +r*r^2*a3 ....
156 * dSinh_r = r^2*a3
157 */
158 movups _dPC3+__svml_dcosh_data_internal(%rip), %xmm2
159 mulpd %xmm7, %xmm2
160
161 /* dSinh_r = r + r*r^2*a3 */
162 mulpd %xmm5, %xmm2
163 movsd (%r8,%rdx), %xmm0
164 movhpd (%r8,%rcx), %xmm0
165 paddq %xmm3, %xmm0
166 addpd %xmm2, %xmm5
167
168 /* dTn = dTn*2^N - dTn*2^-N */
169 movaps %xmm0, %xmm3
170 subpd %xmm6, %xmm3
171
172 /* dTp = dTn*2^N + dTn*2^-N */
173 addpd %xmm6, %xmm0
174 mulpd %xmm5, %xmm3
175
176 /* poly(r) = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
177 movups _dPC4+__svml_dcosh_data_internal(%rip), %xmm5
178 mulpd %xmm7, %xmm5
179 addpd _dPC2+__svml_dcosh_data_internal(%rip), %xmm5
180 mulpd %xmm5, %xmm7
181
182 /* dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
183 mulpd %xmm0, %xmm7
184 addpd %xmm7, %xmm3
185
186 /* _VRES1 = dTp + dTn*sinh(dR)+dTp*dR2*(a2 +a4*dR2) */
187 addpd %xmm3, %xmm0
188 andl $3, %eax
189
190 /* Ret H */
191
192 /* Go to special inputs processing branch (flags come from the andl above: nonzero if either lane is out of range) */
193 jne L(SPECIAL_VALUES_BRANCH)
194 # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4
195
196 /* Restore registers
197 * and exit the function
198 */
199
200 L(EXIT):
201 addq $72, %rsp
202 cfi_def_cfa_offset(8)
203 ret
204 cfi_def_cfa_offset(80)
205
206 /* Branch to process
207 * special inputs: spill the input vector (xmm4) and the fast-path
 * result (xmm0), then save r12-r14 so they can hold the lane counter,
 * the range mask and the lane index across the scalar calls.
208 */
209
210 L(SPECIAL_VALUES_BRANCH):
211 movups %xmm4, 32(%rsp)
212 movups %xmm0, 48(%rsp)
213 # LOE rbx rbp r12 r13 r14 r15 eax xmm0
214
215 xorl %edx, %edx
216 movq %r12, 16(%rsp)
217 cfi_offset(12, -64)
218 movl %edx, %r12d
219 movq %r13, 8(%rsp)
220 cfi_offset(13, -72)
221 movl %eax, %r13d
222 movq %r14, (%rsp)
223 cfi_offset(14, -80)
224 # LOE rbx rbp r15 r12d r13d
225
226 /* Range mask
227 * bits check: test bit r12d (lane counter) of r13d (range mask)
228 */
229
230 L(RANGEMASK_CHECK):
231 btl %r12d, %r13d
232
233 /* Call scalar math function */
234 jc L(SCALAR_MATH_CALL)
235 # LOE rbx rbp r15 r12d r13d
236
237 /* Special inputs
238 * processing loop (runs for lanes 0 and 1)
239 */
240
241 L(SPECIAL_VALUES_LOOP):
242 incl %r12d
243 cmpl $2, %r12d
244
245 /* Check bits in range mask */
246 jl L(RANGEMASK_CHECK)
247 # LOE rbx rbp r15 r12d r13d
248
249 movq 16(%rsp), %r12
250 cfi_restore(12)
251 movq 8(%rsp), %r13
252 cfi_restore(13)
253 movq (%rsp), %r14
254 cfi_restore(14)
255 movups 48(%rsp), %xmm0
256
257 /* Go to exit */
258 jmp L(EXIT)
259 cfi_offset(12, -64)
260 cfi_offset(13, -72)
261 cfi_offset(14, -80)
262 # LOE rbx rbp r12 r13 r14 r15 xmm0
263
264 /* Scalar math function call
265 * to process special input: load lane r12d of the saved input,
 * call cosh, and store the result into the same lane of the saved result.
266 */
267
268 L(SCALAR_MATH_CALL):
269 movl %r12d, %r14d
270 movsd 32(%rsp,%r14,8), %xmm0
271 call cosh@PLT
272 # LOE rbx rbp r14 r15 r12d r13d xmm0
273
274 movsd %xmm0, 48(%rsp,%r14,8)
275
276 /* Process special inputs in loop */
277 jmp L(SPECIAL_VALUES_LOOP)
278 # LOE rbx rbp r15 r12d r13d
279 END(_ZGVbN2v_cosh_sse4)
280
281 .section .rodata, "a"
282 .align 16
283
/* Layout description of the data table that follows.  It is only seen by
 * the preprocessor output when __svml_dcosh_data_internal_typedef is defined,
 * which nothing in this file does; the field order matches the offset
 * #defines near the top of the file.
 */
284 #ifdef __svml_dcosh_data_internal_typedef
285 typedef unsigned int VUINT32;
286 typedef struct
287 {
288 __declspec(align(16)) VUINT32 _dbT[(1 + (1<<8))][2]; //dTpj ONLY! (257 8-byte entries)
289 __declspec(align(16)) VUINT32 _dbInvLn2[2][2];
290 __declspec(align(16)) VUINT32 _dbLn2hi[2][2];
291 __declspec(align(16)) VUINT32 _dbLn2lo[2][2];
292 __declspec(align(16)) VUINT32 _dbShifter[2][2];
293 __declspec(align(16)) VUINT32 _iIndexMask[4][1]; //(1<<K)-1
294 __declspec(align(16)) VUINT32 _dPC2[2][2];
295 __declspec(align(16)) VUINT32 _dPC3[2][2];
296 __declspec(align(16)) VUINT32 _dPC4[2][2];
297 __declspec(align(16)) VUINT32 _iMaxIndex[4][1]; //(1<<K)
298 __declspec(align(16)) VUINT32 _lExpMask[2][2];
299 __declspec(align(16)) VUINT32 _dSign[2][2]; //0x8000000000000000
300 __declspec(align(16)) VUINT32 _iDomainRange[4][1];
301 } __svml_dcosh_data_internal;
302 #endif
303 __svml_dcosh_data_internal:
304 /*== _dbT ==
 * 257 doubles, 4 per line.  The first entry is 0x3fe0000000000000 (0.5) and
 * the last is 0x3ff0000000000000 (1.0); the entries in between presumably
 * follow 2^(j/256)/2 for j = 0..256 - TODO confirm against the generator.
 */
305 .quad 0x3fe0000000000000, 0x3fe00b1afa5abcbf, 0x3fe0163da9fb3335, 0x3fe02168143b0281
306 .quad 0x3fe02c9a3e778061, 0x3fe037d42e11bbcc, 0x3fe04315e86e7f85, 0x3fe04e5f72f654b1
307 .quad 0x3fe059b0d3158574, 0x3fe0650a0e3c1f89, 0x3fe0706b29ddf6de, 0x3fe07bd42b72a836
308 .quad 0x3fe0874518759bc8, 0x3fe092bdf66607e0, 0x3fe09e3ecac6f383, 0x3fe0a9c79b1f3919
309 .quad 0x3fe0b5586cf9890f, 0x3fe0c0f145e46c85, 0x3fe0cc922b7247f7, 0x3fe0d83b23395dec
310 .quad 0x3fe0e3ec32d3d1a2, 0x3fe0efa55fdfa9c5, 0x3fe0fb66affed31b, 0x3fe1073028d7233e
311 .quad 0x3fe11301d0125b51, 0x3fe11edbab5e2ab6, 0x3fe12abdc06c31cc, 0x3fe136a814f204ab
312 .quad 0x3fe1429aaea92de0, 0x3fe14e95934f312e, 0x3fe15a98c8a58e51, 0x3fe166a45471c3c2
313 .quad 0x3fe172b83c7d517b, 0x3fe17ed48695bbc0, 0x3fe18af9388c8dea, 0x3fe1972658375d2f
314 .quad 0x3fe1a35beb6fcb75, 0x3fe1af99f8138a1c, 0x3fe1bbe084045cd4, 0x3fe1c82f95281c6b
315 .quad 0x3fe1d4873168b9aa, 0x3fe1e0e75eb44027, 0x3fe1ed5022fcd91d, 0x3fe1f9c18438ce4d
316 .quad 0x3fe2063b88628cd6, 0x3fe212be3578a819, 0x3fe21f49917ddc96, 0x3fe22bdda27912d1
317 .quad 0x3fe2387a6e756238, 0x3fe2451ffb82140a, 0x3fe251ce4fb2a63f, 0x3fe25e85711ece75
318 .quad 0x3fe26b4565e27cdd, 0x3fe2780e341ddf29, 0x3fe284dfe1f56381, 0x3fe291ba7591bb70
319 .quad 0x3fe29e9df51fdee1, 0x3fe2ab8a66d10f13, 0x3fe2b87fd0dad990, 0x3fe2c57e39771b2f
320 .quad 0x3fe2d285a6e4030b, 0x3fe2df961f641589, 0x3fe2ecafa93e2f56, 0x3fe2f9d24abd886b
321 .quad 0x3fe306fe0a31b715, 0x3fe31432edeeb2fd, 0x3fe32170fc4cd831, 0x3fe32eb83ba8ea32
322 .quad 0x3fe33c08b26416ff, 0x3fe3496266e3fa2d, 0x3fe356c55f929ff1, 0x3fe36431a2de883b
323 .quad 0x3fe371a7373aa9cb, 0x3fe37f26231e754a, 0x3fe38cae6d05d866, 0x3fe39a401b7140ef
324 .quad 0x3fe3a7db34e59ff7, 0x3fe3b57fbfec6cf4, 0x3fe3c32dc313a8e5, 0x3fe3d0e544ede173
325 .quad 0x3fe3dea64c123422, 0x3fe3ec70df1c5175, 0x3fe3fa4504ac801c, 0x3fe40822c367a024
326 .quad 0x3fe4160a21f72e2a, 0x3fe423fb2709468a, 0x3fe431f5d950a897, 0x3fe43ffa3f84b9d4
327 .quad 0x3fe44e086061892d, 0x3fe45c2042a7d232, 0x3fe46a41ed1d0057, 0x3fe4786d668b3237
328 .quad 0x3fe486a2b5c13cd0, 0x3fe494e1e192aed2, 0x3fe4a32af0d7d3de, 0x3fe4b17dea6db7d7
329 .quad 0x3fe4bfdad5362a27, 0x3fe4ce41b817c114, 0x3fe4dcb299fddd0d, 0x3fe4eb2d81d8abff
330 .quad 0x3fe4f9b2769d2ca7, 0x3fe508417f4531ee, 0x3fe516daa2cf6642, 0x3fe5257de83f4eef
331 .quad 0x3fe5342b569d4f82, 0x3fe542e2f4f6ad27, 0x3fe551a4ca5d920f, 0x3fe56070dde910d2
332 .quad 0x3fe56f4736b527da, 0x3fe57e27dbe2c4cf, 0x3fe58d12d497c7fd, 0x3fe59c0827ff07cc
333 .quad 0x3fe5ab07dd485429, 0x3fe5ba11fba87a03, 0x3fe5c9268a5946b7, 0x3fe5d84590998b93
334 .quad 0x3fe5e76f15ad2148, 0x3fe5f6a320dceb71, 0x3fe605e1b976dc09, 0x3fe6152ae6cdf6f4
335 .quad 0x3fe6247eb03a5585, 0x3fe633dd1d1929fd, 0x3fe6434634ccc320, 0x3fe652b9febc8fb7
336 .quad 0x3fe6623882552225, 0x3fe671c1c70833f6, 0x3fe68155d44ca973, 0x3fe690f4b19e9538
337 .quad 0x3fe6a09e667f3bcd, 0x3fe6b052fa75173e, 0x3fe6c012750bdabf, 0x3fe6cfdcddd47645
338 .quad 0x3fe6dfb23c651a2f, 0x3fe6ef9298593ae5, 0x3fe6ff7df9519484, 0x3fe70f7466f42e87
339 .quad 0x3fe71f75e8ec5f74, 0x3fe72f8286ead08a, 0x3fe73f9a48a58174, 0x3fe74fbd35d7cbfd
340 .quad 0x3fe75feb564267c9, 0x3fe77024b1ab6e09, 0x3fe780694fde5d3f, 0x3fe790b938ac1cf6
341 .quad 0x3fe7a11473eb0187, 0x3fe7b17b0976cfdb, 0x3fe7c1ed0130c132, 0x3fe7d26a62ff86f0
342 .quad 0x3fe7e2f336cf4e62, 0x3fe7f3878491c491, 0x3fe80427543e1a12, 0x3fe814d2add106d9
343 .quad 0x3fe82589994cce13, 0x3fe8364c1eb941f7, 0x3fe8471a4623c7ad, 0x3fe857f4179f5b21
344 .quad 0x3fe868d99b4492ed, 0x3fe879cad931a436, 0x3fe88ac7d98a6699, 0x3fe89bd0a478580f
345 .quad 0x3fe8ace5422aa0db, 0x3fe8be05bad61778, 0x3fe8cf3216b5448c, 0x3fe8e06a5e0866d9
346 .quad 0x3fe8f1ae99157736, 0x3fe902fed0282c8a, 0x3fe9145b0b91ffc6, 0x3fe925c353aa2fe2
347 .quad 0x3fe93737b0cdc5e5, 0x3fe948b82b5f98e5, 0x3fe95a44cbc8520f, 0x3fe96bdd9a7670b3
348 .quad 0x3fe97d829fde4e50, 0x3fe98f33e47a22a2, 0x3fe9a0f170ca07ba, 0x3fe9b2bb4d53fe0d
349 .quad 0x3fe9c49182a3f090, 0x3fe9d674194bb8d5, 0x3fe9e86319e32323, 0x3fe9fa5e8d07f29e
350 .quad 0x3fea0c667b5de565, 0x3fea1e7aed8eb8bb, 0x3fea309bec4a2d33, 0x3fea42c980460ad8
351 .quad 0x3fea5503b23e255d, 0x3fea674a8af46052, 0x3fea799e1330b358, 0x3fea8bfe53c12e59
352 .quad 0x3fea9e6b5579fdbf, 0x3feab0e521356eba, 0x3feac36bbfd3f37a, 0x3fead5ff3a3c2774
353 .quad 0x3feae89f995ad3ad, 0x3feafb4ce622f2ff, 0x3feb0e07298db666, 0x3feb20ce6c9a8952
354 .quad 0x3feb33a2b84f15fb, 0x3feb468415b749b1, 0x3feb59728de5593a, 0x3feb6c6e29f1c52a
355 .quad 0x3feb7f76f2fb5e47, 0x3feb928cf22749e4, 0x3feba5b030a1064a, 0x3febb8e0b79a6f1f
356 .quad 0x3febcc1e904bc1d2, 0x3febdf69c3f3a207, 0x3febf2c25bd71e09, 0x3fec06286141b33d
357 .quad 0x3fec199bdd85529c, 0x3fec2d1cd9fa652c, 0x3fec40ab5fffd07a, 0x3fec544778fafb22
358 .quad 0x3fec67f12e57d14b, 0x3fec7ba88988c933, 0x3fec8f6d9406e7b5, 0x3feca3405751c4db
359 .quad 0x3fecb720dcef9069, 0x3feccb0f2e6d1675, 0x3fecdf0b555dc3fa, 0x3fecf3155b5bab74
360 .quad 0x3fed072d4a07897c, 0x3fed1b532b08c968, 0x3fed2f87080d89f2, 0x3fed43c8eacaa1d6
361 .quad 0x3fed5818dcfba487, 0x3fed6c76e862e6d3, 0x3fed80e316c98398, 0x3fed955d71ff6075
362 .quad 0x3feda9e603db3285, 0x3fedbe7cd63a8315, 0x3fedd321f301b460, 0x3fede7d5641c0658
363 .quad 0x3fedfc97337b9b5f, 0x3fee11676b197d17, 0x3fee264614f5a129, 0x3fee3b333b16ee12
364 .quad 0x3fee502ee78b3ff6, 0x3fee653924676d76, 0x3fee7a51fbc74c83, 0x3fee8f7977cdb740
365 .quad 0x3feea4afa2a490da, 0x3feeb9f4867cca6e, 0x3feecf482d8e67f1, 0x3feee4aaa2188510
366 .quad 0x3feefa1bee615a27, 0x3fef0f9c1cb6412a, 0x3fef252b376bba97, 0x3fef3ac948dd7274
367 .quad 0x3fef50765b6e4540, 0x3fef6632798844f8, 0x3fef7bfdad9cbe14, 0x3fef91d802243c89
368 .quad 0x3fefa7c1819e90d8, 0x3fefbdba3692d514, 0x3fefd3c22b8f71f1, 0x3fefe9d96b2a23d9
369 .quad 0x3ff0000000000000
370 .align 16
371 .quad 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */
372 .align 16
373 .quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi (low 18 mantissa bits are zero) */
374 .align 16
375 .quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo (negative correction to _dbLn2hi) */
376 .align 16
377 .quad 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter = 1.5*2^44 */
378 .align 16
379 .long 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF /* _iIndexMask = (1<<8)-1 */
380 .align 16
381 .quad 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD /* _dPC2 (close to 1/2) */
382 .align 16
383 .quad 0x3FC5555570813E14, 0x3FC5555570813E14 /* _dPC3 (close to 1/6) */
384 .align 16
385 .quad 0x3FA55555CF16D299, 0x3FA55555CF16D299 /* _dPC4 (close to 1/24) */
386 .align 16
387 .long 0x00000100, 0x00000100, 0x00000100, 0x00000100 /* _iMaxIndex = 1<<8 */
388 .align 16
389 .quad 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask: exponent field of a double */
390 .align 16
391 .quad 0x8000000000000000, 0x8000000000000000 /* _dSign: sign bit of a double */
392 .align 16
393 .long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange: high dword of 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp*/
394 .align 16
395 .type __svml_dcosh_data_internal,@object
396 .size __svml_dcosh_data_internal,.-__svml_dcosh_data_internal