/* Function erff vectorized with AVX-512.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      erf(x) is computed as higher precision simple polynomial
 *      with no lookup table:
 *
 *        R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
 *        erf(x) = R * R * x;
 *
 *      Special cases:
 *
 *      erf(0)    = 0
 *      erf(+INF) = +1
 *      erf(-INF) = -1
 *      erf(QNaN) = QNaN
 *      erf(SNaN) = QNaN
 *
 */
/* Offsets for data table __svml_serf_data_internal.
   Every field is one 64-byte-aligned, 64-byte-wide (full ZMM) splatted
   constant, so consecutive offsets step by 64.  These offsets must match
   the layout of the table emitted in .rodata below.  */
#define _AbsMask			0
#define _One				64
#define _gf_MaxThreshold_LA		128
#define _gf_la_poly_0			192
#define _gf_la_poly_1			256
#define _gf_la_poly_2			320
#define _gf_la_poly_3			384
#define _gf_la_poly_4			448
#define _gf_la_poly_5			512
#define _gf_la_poly_6			576
#define _gf_la_poly_7			640
#define _gf_la_poly_8			704
#define _gf_la_poly_9			768
#define _gf_la_poly_10			832
#define _gf_la_poly_11			896
#define _gf_la_poly_12			960

#include <sysdep.h>
	.section .text.evex512, "ax", @progbits
/* float32x16 vector erff: zmm0 = erf(zmm0), 16 lanes at once.
   The 16 single-precision lanes are split into two halves; each half
   is widened to double precision, the degree-12 polynomial in x^2 is
   evaluated in double via two interleaved Horner chains, and the two
   halves are narrowed back and merged.  No stack frame, no spills.  */
ENTRY(_ZGVeN16v_erff_skx)
	/* Keep the input x in zmm8 for the whole function (zmm0 is
	   reused as scratch and finally as the result register).  */
	vmovaps	%zmm0, %zmm8
	/* zmm11 = x*x in single precision — only used for the
	   saturation-threshold compare below, not for the polynomial.  */
	vmulps	{rn-sae}, %zmm8, %zmm8, %zmm11
	/* Preload polynomial coefficients P2..P12 (P0/P1 loaded later
	   once registers free up).  Each is an 8-wide double splat.  */
	vmovups	_gf_la_poly_11+__svml_serf_data_internal(%rip), %zmm15
	vmovups	_gf_la_poly_12+__svml_serf_data_internal(%rip), %zmm10
	vmovups	_gf_la_poly_10+__svml_serf_data_internal(%rip), %zmm9
	vmovups	_gf_la_poly_9+__svml_serf_data_internal(%rip), %zmm7
	vmovups	_gf_la_poly_8+__svml_serf_data_internal(%rip), %zmm0
	vmovups	_gf_la_poly_7+__svml_serf_data_internal(%rip), %zmm1
	vmovups	_gf_la_poly_6+__svml_serf_data_internal(%rip), %zmm2
	vmovups	_gf_la_poly_5+__svml_serf_data_internal(%rip), %zmm3
	vmovups	_gf_la_poly_4+__svml_serf_data_internal(%rip), %zmm4
	vmovups	_gf_la_poly_3+__svml_serf_data_internal(%rip), %zmm5
	vmovups	_gf_la_poly_2+__svml_serf_data_internal(%rip), %zmm6
	/* Split x into low 8 lanes (ymm8) and high 8 lanes (ymm13),
	   widen both to double, then square:
	   zmm12 = x_lo^2 (double), zmm13 = x_hi^2 (double).  */
	vextractf32x8	$1, %zmm8, %ymm13
	vcvtps2pd	{sae}, %ymm8, %zmm12
	vcvtps2pd	{sae}, %ymm13, %zmm14
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm12
	vmulpd	{rn-sae}, %zmm14, %zmm14, %zmm13

	/* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
	   Two interleaved Horner chains:
	     low half : accumulator stays in zmm14 (fmadd213 form,
	                acc = acc*x2_lo + Pk);
	     high half: accumulator hops through the coefficient
	                registers (fmadd231 form, Pk += x2_hi*acc),
	                freeing each coefficient register as it goes.  */
	vmovaps	%zmm15, %zmm14
	vfmadd231pd	{rn-sae}, %zmm12, %zmm10, %zmm14	/* lo: P11 + x2*P12 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm10, %zmm15	/* hi: P11 + x2*P12 */
	vmovups	_gf_la_poly_1+__svml_serf_data_internal(%rip), %zmm10
	vfmadd213pd	{rn-sae}, %zmm9, %zmm12, %zmm14		/* lo: acc*x2 + P10 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm15, %zmm9		/* hi: P10 + x2*acc */
	vfmadd213pd	{rn-sae}, %zmm7, %zmm12, %zmm14		/* lo: acc*x2 + P9 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm9, %zmm7		/* hi: P9 + x2*acc */
	vfmadd213pd	{rn-sae}, %zmm0, %zmm12, %zmm14		/* lo: acc*x2 + P8 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm7, %zmm0		/* hi: P8 + x2*acc */
	vmovups	_gf_MaxThreshold_LA+__svml_serf_data_internal(%rip), %zmm7
	vfmadd213pd	{rn-sae}, %zmm1, %zmm12, %zmm14		/* lo: acc*x2 + P7 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm0, %zmm1		/* hi: P7 + x2*acc */
	vmovups	_gf_la_poly_0+__svml_serf_data_internal(%rip), %zmm0
	/* k1 = (MaxThreshold NLE_UQ x^2): lane set when x^2 is below
	   the threshold OR when x is NaN (unordered), so NaN lanes take
	   the polynomial path and propagate a QNaN; unset lanes keep the
	   saturated +-1.0 default written into zmm0 below.  */
	vcmpps	$22, {sae}, %zmm11, %zmm7, %k1
	vfmadd213pd	{rn-sae}, %zmm2, %zmm12, %zmm14		/* lo: acc*x2 + P6 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm1, %zmm2		/* hi: P6 + x2*acc */
	vfmadd213pd	{rn-sae}, %zmm3, %zmm12, %zmm14		/* lo: acc*x2 + P5 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm2, %zmm3		/* hi: P5 + x2*acc */
	vfmadd213pd	{rn-sae}, %zmm4, %zmm12, %zmm14		/* lo: acc*x2 + P4 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm3, %zmm4		/* hi: P4 + x2*acc */
	vfmadd213pd	{rn-sae}, %zmm5, %zmm12, %zmm14		/* lo: acc*x2 + P3 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm4, %zmm5		/* hi: P3 + x2*acc */
	vfmadd213pd	{rn-sae}, %zmm6, %zmm12, %zmm14		/* lo: acc*x2 + P2 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm5, %zmm6		/* hi: P2 + x2*acc */
	vmovups	_AbsMask+__svml_serf_data_internal(%rip), %zmm5
	vfmadd213pd	{rn-sae}, %zmm10, %zmm12, %zmm14	/* lo: acc*x2 + P1 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm6, %zmm10		/* hi: P1 + x2*acc */
	/* zmm6 = sign bit of x (~AbsMask & x).  */
	vandnps	%zmm8, %zmm5, %zmm6
	/* Final Horner step: R_lo in zmm12, R_hi in zmm13 (= acc*x2 + P0).  */
	vfmadd213pd	{rn-sae}, %zmm0, %zmm14, %zmm12
	vfmadd213pd	{rn-sae}, %zmm0, %zmm10, %zmm13
	/* zmm0 = copysign(1.0, x): the saturated result for |x| at or
	   beyond the threshold (erf(+-INF) = +-1).  */
	vorps	_One+__svml_serf_data_internal(%rip), %zmm6, %zmm0
	/* Square R in double, narrow both halves back to single and
	   reassemble a full 16-lane vector of R*R in zmm9.  */
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm1
	vmulpd	{rn-sae}, %zmm13, %zmm13, %zmm3
	vcvtpd2ps	{rn-sae}, %zmm1, %ymm2
	vcvtpd2ps	{rn-sae}, %zmm3, %ymm4
	vinsertf32x8	$1, %ymm4, %zmm2, %zmm9

	/* erf(x) = R * R * x;
	   merge-masked: lanes in k1 get R*R*x, the rest keep +-1.0.  */
	vmulps	{rn-sae}, %zmm8, %zmm9, %zmm0{%k1}
	ret

END(_ZGVeN16v_erff_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_serf_data_internal_typedef
/* C-style description of the table layout below: three 16 x float
   fields followed by thirteen 8 x double polynomial coefficients,
   each field 64-byte aligned (one ZMM load each).  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 _AbsMask[16][1];
	__declspec(align(64)) VUINT32 _One[16][1];
	__declspec(align(64)) VUINT32 _gf_MaxThreshold_LA[16][1];
	__declspec(align(64)) VUINT32 _gf_la_poly_0[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_1[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_2[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_3[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_4[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_5[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_6[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_7[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_8[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_9[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_10[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_11[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_12[8][2];
} __svml_serf_data_internal;
#endif
__svml_serf_data_internal:
	/* 0x7fffffff: all bits but the sign — used to isolate sign(x).  */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _AbsMask */
	.align	64
	/* 1.0f: ORed with the sign bit to build the saturated +-1 result.  */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _One */
	.align	64
	/* Threshold compared against x^2; beyond it erf has saturated to +-1
	   in single precision.  */
	.long	0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a /* _gf_MaxThreshold_LA */
	.align	64
	/* Double-precision polynomial coefficients P0..P12 of R(x^2),
	   where erf(x) = R*R*x.  */
	.quad	0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903 /* _gf_la_poly_0 */
	.align	64
	.quad	0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367 /* _gf_la_poly_1 */
	.align	64
	.quad	0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b /* _gf_la_poly_2 */
	.align	64
	.quad	0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc /* _gf_la_poly_3 */
	.align	64
	.quad	0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392 /* _gf_la_poly_4 */
	.align	64
	.quad	0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede /* _gf_la_poly_5 */
	.align	64
	.quad	0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0 /* _gf_la_poly_6 */
	.align	64
	.quad	0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f /* _gf_la_poly_7 */
	.align	64
	.quad	0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523 /* _gf_la_poly_8 */
	.align	64
	.quad	0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47 /* _gf_la_poly_9 */
	.align	64
	.quad	0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03 /* _gf_la_poly_10 */
	.align	64
	.quad	0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb /* _gf_la_poly_11 */
	.align	64
	.quad	0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1 /* _gf_la_poly_12 */
	.align	64
	.type	__svml_serf_data_internal, @object
	.size	__svml_serf_data_internal, .-__svml_serf_data_internal