]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/fpu/multiarch/svml_s_expm1f4_core_sse4.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / x86_64 / fpu / multiarch / svml_s_expm1f4_core_sse4.S
CommitLineData
76ddc74e 1/* Function expm1f vectorized with SSE4.
581c785b 2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
76ddc74e
SP
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
23 * exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
24 * expm1(x) = exp(x)-1 is then obtained via multi-precision computation
25 *
26 *
27 */
28
29/* Offsets for data table __svml_sexpm1_data_internal
30 */
/* Each value is a byte offset from the label __svml_sexpm1_data_internal;
 * the function adds it to the label in RIP-relative operands.  */
31#define Expm1_HA_table 0 /* 64 entries of two 32-bit words (512 bytes) */
32#define poly_coeff 512 /* two 16-byte coefficient vectors, then 32 bytes of padding */
33#define Log2e 576 /* factor multiplied with x before rounding to N */
34#define L2H 592 /* high part of the constant subtracted as N*L2H */
35#define L2L 608 /* low part of the constant subtracted as N*L2L */
36#define ExpAddConst 624 /* added to N; index and exponent bits are masked out of the sum */
37#define IndexMask 640 /* selects the table byte-offset bits (0x1f8) */
38#define ExpMask 656 /* selects the exponent bits (0x1fe00), later shifted left by 14 */
39#define MOne 672 /* -1.0f in every lane */
40#define AbsMask 688 /* 0x7fffffff: clears the sign bit */
41#define Threshold 704 /* lanes with NOT(|x| <= Threshold) use the scalar fallback */
42#define L2 720 /* not referenced by _ZGVbN4v_expm1f_sse4 in this file */
43
44#include <sysdep.h>
45
46 .text
47 .section .text.sse4,"ax",@progbits
/* _ZGVbN4v_expm1f_sse4
 *
 * Input:  %xmm0 = four packed single-precision values x.
 * Output: %xmm0 = four packed single-precision results.
 *
 * Main path (all four lanes at once):
 *   N   = x*Log2e rounded to an integer value (add/subtract of .FLT_10)
 *   R   = x - N*L2H - N*L2L
 *   T   = table entry selected by the IndexMask bits of N + ExpAddConst,
 *         combined with the ExpMask bits of the same sum; each entry is a
 *         pair {A, B}: A is ORed with the exponent bits, B is multiplied
 *         by them and added as a correction
 *   p   = R + R*R*(coeff2 + coeff3*R)
 *   res = (T - 1) + T*p, then ORed with the sign bit of x
 *
 * Lanes for which NOT(|x| <= Threshold) holds are afterwards recomputed
 * one by one with a call to the scalar expm1f.
 */
48ENTRY(_ZGVbN4v_expm1f_sse4)
49 pushq %rbp
50 cfi_def_cfa_offset(16)
51 movq %rsp, %rbp
52 cfi_def_cfa(6, 16)
53 cfi_offset(6, -16)
54 andq $-32, %rsp /* align the stack pointer down to a 32-byte boundary */
55 subq $64, %rsp /* 64 bytes of scratch, touched only on the special-input path */
56 movaps %xmm0, %xmm4 /* xmm4 = x; turned into R by the two subps below */
57 movups Log2e+__svml_sexpm1_data_internal(%rip), %xmm9
58 lea __svml_sexpm1_data_internal(%rip), %r8 /* r8 = table base for the indexed loads */
59 mulps %xmm0, %xmm9 /* xmm9 = x * Log2e */
60 movups .FLT_10(%rip), %xmm5 /* xmm5 = 0x4b400000 (1.5 * 2^23) in every lane */
61 movups ExpAddConst+__svml_sexpm1_data_internal(%rip), %xmm2
62 addps %xmm5, %xmm9
63
64/* argument reduction */
65 movups L2H+__svml_sexpm1_data_internal(%rip), %xmm6
66 subps %xmm5, %xmm9 /* adding then subtracting 1.5*2^23 leaves xmm9 = N, an integer-valued float */
67 mulps %xmm9, %xmm6 /* xmm6 = N * L2H */
68 addps %xmm9, %xmm2 /* xmm2 = N + ExpAddConst; only its bit pattern is used from here on */
69
70/* table lookup */
71 movdqu IndexMask+__svml_sexpm1_data_internal(%rip), %xmm12
72 subps %xmm6, %xmm4 /* xmm4 = x - N*L2H */
73 pand %xmm2, %xmm12 /* xmm12 = per-lane byte offset into the table */
74 movups L2L+__svml_sexpm1_data_internal(%rip), %xmm7
75 movups AbsMask+__svml_sexpm1_data_internal(%rip), %xmm3
76 pshufd $1, %xmm12, %xmm10 /* bring the lane 1 offset into the low dword */
77 movaps %xmm3, %xmm8
78 mulps %xmm9, %xmm7 /* xmm7 = N * L2L */
79 andps %xmm0, %xmm8 /* xmm8 = |x| */
80 cmpnleps Threshold+__svml_sexpm1_data_internal(%rip), %xmm8 /* lane mask: NOT(|x| <= Threshold); also set for NaN lanes */
81 movd %xmm12, %edx /* edx = lane 0 offset */
82 subps %xmm7, %xmm4 /* xmm4 = R = x - N*L2H - N*L2L */
83 movd %xmm10, %ecx /* ecx = lane 1 offset */
84 movmskps %xmm8, %eax /* eax = 4-bit mask of lanes that need the scalar fallback */
85 pshufd $2, %xmm12, %xmm11
86 movaps %xmm4, %xmm7 /* xmm7 = R, squared below */
87 pshufd $3, %xmm12, %xmm13
88 andnps %xmm0, %xmm3 /* xmm3 = x & ~AbsMask, i.e. the sign bit of each lane */
89 movd %xmm11, %esi /* esi = lane 2 offset */
90 movd %xmm13, %edi /* edi = lane 3 offset */
91
92/* polynomial */
93 movups poly_coeff+__svml_sexpm1_data_internal(%rip), %xmm8 /* xmm8 = coeff3 */
94 movdqu ExpMask+__svml_sexpm1_data_internal(%rip), %xmm6
95 movslq %edx, %rdx
96 pand %xmm6, %xmm2 /* keep only the ExpMask bits of N + ExpAddConst */
97 movslq %ecx, %rcx
98 pslld $14, %xmm2 /* shift them left by 14; xmm2 is used below as bits (orps) and as a float factor (mulps) */
99 movslq %esi, %rsi
100 movslq %edi, %rdi
101 movq (%r8,%rdx), %xmm1 /* 8-byte table entry {A0, B0} for lane 0 */
102 movq (%r8,%rcx), %xmm14 /* {A1, B1} */
103 movq (%r8,%rsi), %xmm5 /* {A2, B2} */
104 movq (%r8,%rdi), %xmm15 /* {A3, B3} */
105 unpcklps %xmm14, %xmm1 /* xmm1 = {A0, A1, B0, B1} */
106 mulps %xmm4, %xmm8 /* xmm8 = coeff3 * R */
107 movaps %xmm1, %xmm10
108 mulps %xmm4, %xmm7 /* xmm7 = R * R */
109 addps poly_coeff+16+__svml_sexpm1_data_internal(%rip), %xmm8 /* xmm8 = coeff3*R + coeff2 */
110 unpcklps %xmm15, %xmm5 /* xmm5 = {A2, A3, B2, B3} */
111 movlhps %xmm5, %xmm10 /* xmm10 = {A0, A1, A2, A3} */
112 shufps $238, %xmm5, %xmm1 /* xmm1 = {B0, B1, B2, B3} */
113 orps %xmm2, %xmm10 /* merge the shifted exponent bits into the A words */
114
115/* T-1 */
116 movups MOne+__svml_sexpm1_data_internal(%rip), %xmm9 /* xmm9 = -1.0 */
117 mulps %xmm2, %xmm1 /* scale the B words by xmm2 */
118 addps %xmm9, %xmm10 /* (A with exponent) - 1 */
119 mulps %xmm7, %xmm8 /* xmm8 = R*R*(coeff3*R + coeff2) */
120 addps %xmm1, %xmm10 /* xmm10 = T - 1, including the scaled B correction */
121 addps %xmm8, %xmm4 /* xmm4 = p = R + R*R*(coeff3*R + coeff2) */
122 movaps %xmm10, %xmm1
123 subps %xmm9, %xmm1 /* xmm1 = (T - 1) - (-1.0) = T */
124 mulps %xmm1, %xmm4 /* xmm4 = T * p */
125 addps %xmm4, %xmm10 /* xmm10 = (T - 1) + T*p */
126 orps %xmm3, %xmm10 /* OR the sign bit of x into the result */
127 testl %eax, %eax /* non-zero if any lane was flagged by the Threshold compare */
128
129/* Go to special inputs processing branch */
130 jne L(SPECIAL_VALUES_BRANCH)
131 # LOE rbx r12 r13 r14 r15 eax xmm0 xmm10
132
133/* Restore registers
134 * and exit the function
135 */
136
137L(EXIT):
138 movaps %xmm10, %xmm0 /* return value = xmm10 */
139 movq %rbp, %rsp /* drop the aligned scratch area */
140 popq %rbp
141 cfi_def_cfa(7, 8)
142 cfi_restore(6)
143 ret
/* The two directives below re-establish the %rbp-based unwind state for the
 * code that follows the ret, which still runs inside the frame set up at
 * entry.  */
144 cfi_def_cfa(6, 16)
145 cfi_offset(6, -16)
146
147/* Branch to process
148 * special inputs
149 */
150
/* Scratch layout used from here on:
 *    0(%rsp) saved %r14     8(%rsp) saved %r13    16(%rsp) saved %r12
 *   32(%rsp) the four inputs x     48(%rsp) the four results
 */
151L(SPECIAL_VALUES_BRANCH):
152 movups %xmm0, 32(%rsp) /* spill the inputs so single lanes can be reloaded */
153 movups %xmm10, 48(%rsp) /* spill the vector results; flagged lanes get overwritten */
154 # LOE rbx r12 r13 r14 r15 eax
155
156 xorl %edx, %edx
157 movq %r12, 16(%rsp) /* r12-r14 hold loop state across the call below, so save the caller's values */
158 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
159 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
160 movl %edx, %r12d /* r12d = lane counter, starts at 0 */
161 movq %r13, 8(%rsp)
162 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
163 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
164 movl %eax, %r13d /* r13d = mask of flagged lanes */
165 movq %r14, (%rsp)
166 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
167 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
168 # LOE rbx r15 r12d r13d
169
170/* Range mask
171 * bits check
172 */
173
174L(RANGEMASK_CHECK):
175 btl %r12d, %r13d /* CF = mask bit of the current lane */
176
177/* Call scalar math function */
178 jc L(SCALAR_MATH_CALL)
179 # LOE rbx r15 r12d r13d
180
181/* Special inputs
182 * processing loop
183 */
184
185L(SPECIAL_VALUES_LOOP):
186 incl %r12d
187 cmpl $4, %r12d /* four lanes in total */
188
189/* Check bits in range mask */
190 jl L(RANGEMASK_CHECK)
191 # LOE rbx r15 r12d r13d
192
193 movq 16(%rsp), %r12
194 cfi_restore(12)
195 movq 8(%rsp), %r13
196 cfi_restore(13)
197 movq (%rsp), %r14
198 cfi_restore(14)
199 movups 48(%rsp), %xmm10 /* reload the results, now including the scalar ones */
200
201/* Go to exit */
202 jmp L(EXIT)
/* The escapes below repeat the r12-r14 save locations for the code after
 * the jmp, which runs while those registers are still saved on the stack.  */
203 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
204 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
205 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
206 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
207 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
208 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
209 # LOE rbx r12 r13 r14 r15 xmm10
210
211/* Scalar math function call
212 * to process special input
213 */
214
215L(SCALAR_MATH_CALL):
216 movl %r12d, %r14d /* r14 = lane index, zero-extended by the 32-bit move */
217 movss 32(%rsp,%r14,4), %xmm0 /* xmm0 = input of the flagged lane */
218 call expm1f@PLT
219 # LOE rbx r14 r15 r12d r13d xmm0
220
221 movss %xmm0, 48(%rsp,%r14,4) /* overwrite that lane's slot in the result buffer */
222
223/* Process special inputs in loop */
224 jmp L(SPECIAL_VALUES_LOOP)
225 # LOE rbx r15 r12d r13d
226END(_ZGVbN4v_expm1f_sse4)
227
228 .section .rodata, "a"
229 .align 16
230
/* Read-only constants for _ZGVbN4v_expm1f_sse4.  The byte offsets of the
 * fields must match the #define offsets at the top of the file.  The typedef
 * below is only seen by the preprocessor when
 * __svml_sexpm1_data_internal_typedef is defined; it lists the same fields
 * in the same order as the data that follows.  */
231#ifdef __svml_sexpm1_data_internal_typedef
232typedef unsigned int VUINT32;
233typedef struct {
234 __declspec(align(16)) VUINT32 Expm1_HA_table[(1<<7)][1];
235 __declspec(align(16)) VUINT32 poly_coeff[4][4][1];
236 __declspec(align(16)) VUINT32 Log2e[4][1];
237 __declspec(align(16)) VUINT32 L2H[4][1];
238 __declspec(align(16)) VUINT32 L2L[4][1];
239 __declspec(align(16)) VUINT32 ExpAddConst[4][1];
240 __declspec(align(16)) VUINT32 IndexMask[4][1];
241 __declspec(align(16)) VUINT32 ExpMask[4][1];
242 __declspec(align(16)) VUINT32 MOne[4][1];
243 __declspec(align(16)) VUINT32 AbsMask[4][1];
244 __declspec(align(16)) VUINT32 Threshold[4][1];
245 __declspec(align(16)) VUINT32 L2[4][1];
246} __svml_sexpm1_data_internal;
247#endif
248__svml_sexpm1_data_internal:
249 /* Expm1_HA_table */
/* 64 entries of two 32-bit words.  The function loads one 8-byte entry per
 * lane; the first word is ORed with exponent bits, the second word is
 * multiplied by the same bits (read as a float) and added as a correction.  */
250 .long 0x00000000, 0x00000000
251 .long 0x00016000, 0x391a3e78
252 .long 0x0002d000, 0xb89e59d5
253 .long 0x00044000, 0xb93ae78a
254 .long 0x0005b000, 0xb9279306
255 .long 0x00072000, 0xb79e6961
256 .long 0x0008a000, 0xb97e2fee
257 .long 0x000a1000, 0x391aaea9
258 .long 0x000b9000, 0x39383c7d
259 .long 0x000d2000, 0xb9241490
260 .long 0x000ea000, 0x39073169
261 .long 0x00103000, 0x386e218a
262 .long 0x0011c000, 0x38f4dceb
263 .long 0x00136000, 0xb93a9a1e
264 .long 0x0014f000, 0x391df520
265 .long 0x00169000, 0x3905a6e4
266 .long 0x00183000, 0x397e0a32
267 .long 0x0019e000, 0x370b2641
268 .long 0x001b9000, 0xb8b1918b
269 .long 0x001d4000, 0xb8132c6a
270 .long 0x001ef000, 0x39264c12
271 .long 0x0020b000, 0x37221f73
272 .long 0x00227000, 0x37060619
273 .long 0x00243000, 0x3922b5c1
274 .long 0x00260000, 0xb814ab27
275 .long 0x0027d000, 0xb89b12c6
276 .long 0x0029a000, 0x382d5a75
277 .long 0x002b8000, 0xb938c94b
278 .long 0x002d6000, 0xb97822b8
279 .long 0x002f4000, 0xb910ea53
280 .long 0x00312000, 0x38fd6075
281 .long 0x00331000, 0x38620955
282 .long 0x00350000, 0x391e667f
283 .long 0x00370000, 0xb89b8736
284 .long 0x00390000, 0xb90a1714
285 .long 0x003b0000, 0xb7a54ded
286 .long 0x003d1000, 0xb96b8c15
287 .long 0x003f1000, 0x397336cf
288 .long 0x00413000, 0xb8eccd66
289 .long 0x00434000, 0x39599b45
290 .long 0x00456000, 0x3965422b
291 .long 0x00479000, 0xb8a2cdd5
292 .long 0x0049c000, 0xb9484f32
293 .long 0x004bf000, 0xb8fac043
294 .long 0x004e2000, 0x391182a4
295 .long 0x00506000, 0x38ccf6bc
296 .long 0x0052b000, 0xb97c4dc2
297 .long 0x0054f000, 0x38d6aaf4
298 .long 0x00574000, 0x391f995b
299 .long 0x0059a000, 0xb8ba8f62
300 .long 0x005c0000, 0xb9090d05
301 .long 0x005e6000, 0x37f4825e
302 .long 0x0060d000, 0xb8c844f5
303 .long 0x00634000, 0xb76d1a83
304 .long 0x0065c000, 0xb95f2310
305 .long 0x00684000, 0xb952b5f8
306 .long 0x006ac000, 0x37c6e7dd
307 .long 0x006d5000, 0xb7cfe126
308 .long 0x006fe000, 0x3917337c
309 .long 0x00728000, 0x383b9e2d
310 .long 0x00752000, 0x392fa2a5
311 .long 0x0077d000, 0x37df730b
312 .long 0x007a8000, 0x38ecb6dd
313 .long 0x007d4000, 0xb879f986
314 /*== poly_coeff[4] ==*/
/* Only the first two 16-byte rows hold coefficients; the remaining 32 bytes
 * are zero padding so that Log2e starts at offset 576.  */
315 .align 16
316 .long 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF /* coeff3 */
317 .long 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F /* coeff2 */
318 /* 32 Byte Padding */
319 .zero 32
320 /*== Log2e ==*/
/* About 92.33 as a float; presumably 2^6 * log2(e), matching the 64-entry
 * table -- confirm against the table generator.  */
321 .align 16
322 .long 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B
323 /*== L2H ==*/
324 .align 16
325 .long 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000
326 /*== L2L ==*/
327 .align 16
328 .long 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083
329 /*== ExpAddConst ==*/
/* Exponent field 0x93 (2^20), so one unit of N lands on mantissa bit 3.
 * Bits 3..8 of the constant are zero and bits 9..16 hold 0x7f.  */
330 .align 16
331 .long 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00
332 /*== IndexMask ==*/
/* Bits 3..8: a multiple of 8 in 0..504, used directly as a byte offset into
 * Expm1_HA_table.  */
333 .align 16
334 .long 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8
335 /*== ExpMask ==*/
/* Bits 9..16; the function shifts them left by 14 into bits 23..30.  */
336 .align 16
337 .long 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00
338 /*== MOne ==*/
/* -1.0f */
339 .align 16
340 .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
341 /*== AbsMask ==*/
342 .align 16
343 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
344 /*== Threshold ==*/
345 .align 16
346 .long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B // 86.643394
347 /*== L2 ==*/
/* Not read by the function in this file.  */
348 .align 16
349 .long 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218
350 .align 16
351 .type __svml_sexpm1_data_internal,@object
352 .size __svml_sexpm1_data_internal,.-__svml_sexpm1_data_internal
353 .align 16
354
/* 1.5 * 2^23 in every lane.  The function adds this to x*Log2e and subtracts
 * it again, which leaves the product rounded to an integer value.  */
355.FLT_10:
356 .long 0x4b400000,0x4b400000,0x4b400000,0x4b400000
357 .type .FLT_10,@object
358 .size .FLT_10,16