]>
Commit | Line | Data |
---|---|---|
74265c16 | 1 | /* Function log1pf vectorized with AVX2. |
6d7e8eda | 2 | Copyright (C) 2021-2023 Free Software Foundation, Inc. |
74265c16 SP |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | https://www.gnu.org/licenses/. */ | |
18 | ||
19 | /* | |
20 | * ALGORITHM DESCRIPTION: | |
21 | * | |
b44b6f42 | 22 | * 1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2) |
74265c16 SP |
23 | * Get short reciprocal approximation Rcp ~ 1/xh |
24 | * R = (Rcp*xh - 1.0) + Rcp*xl | |
25 | * log1p(x) = k*log(2.0) - log(Rcp) + poly(R) | |
26 | * log(Rcp) is tabulated | |
27 | * | |
28 | * | |
29 | */ | |
30 | ||
31 | /* Byte offsets of the fields of data table __svml_slog1p_data_internal. | |
32 | Each field is one 32-byte row (8 x .long); sPoly spans 8 such rows. */ | |
b44b6f42 SP |
33 | #define SgnMask 0 /* 0x7fffffff in every lane */ |
34 | #define sOne 32 /* SP 1.0 */ | |
35 | #define sPoly 64 /* polynomial coefficients, P0 first, P7 at sPoly+224 */ | |
36 | #define iHiDelta 320 /* integer bias added to the high part for the range check */ | |
37 | #define iLoRange 352 /* integer threshold the biased high part is compared against */ | |
38 | #define iBrkValue 384 /* SP 2/3 */ | |
39 | #define iOffExpoMask 416 /* SP significand mask */ | |
40 | #define sLn2 448 /* SP ln(2) */ | |
74265c16 SP |
41 | |
42 | #include <sysdep.h> | |
43 | ||
b44b6f42 | 44 | .section .text.avx2, "ax", @progbits |
74265c16 | 45 | ENTRY(_ZGVdN8v_log1pf_avx2) /* 8 float lanes: input x in %ymm0, result returned in %ymm0 */ |
b44b6f42 SP |
46 | pushq %rbp /* keep a frame pointer so %rsp can be realigned below */ |
47 | cfi_def_cfa_offset(16) | |
48 | movq %rsp, %rbp | |
49 | cfi_def_cfa(6, 16) | |
50 | cfi_offset(6, -16) | |
51 | andq $-32, %rsp /* round %rsp down to a 32-byte boundary */ | |
52 | subq $96, %rsp /* 96-byte scratch area, only touched on the special-values path */ | |
53 | vmovups sOne+__svml_slog1p_data_internal(%rip), %ymm2 /* ymm2 = 1.0 */ | |
54 | ||
55 | /* reduction: compute r, n */ | |
56 | vmovups iBrkValue+__svml_slog1p_data_internal(%rip), %ymm13 /* ymm13 = iBrkValue */ | |
57 | vmovups SgnMask+__svml_slog1p_data_internal(%rip), %ymm4 /* ymm4 = SgnMask */ | |
58 | vmovups iLoRange+__svml_slog1p_data_internal(%rip), %ymm8 /* ymm8 = iLoRange */ | |
59 | vmovaps %ymm0, %ymm3 /* ymm3 = x, kept intact for the scalar fallback */ | |
60 | ||
61 | /* compute 1+x as high, low parts */ | |
62 | vmaxps %ymm3, %ymm2, %ymm5 /* ymm5 = lane-wise max of 1.0 and x */ | |
63 | vminps %ymm3, %ymm2, %ymm6 /* ymm6 = lane-wise min of 1.0 and x */ | |
64 | vaddps %ymm6, %ymm5, %ymm10 /* ymm10 = max + min: high part of 1+x */ | |
65 | vpsubd %ymm13, %ymm10, %ymm11 /* ymm11 = bits(high) - iBrkValue (integer subtract) */ | |
66 | ||
67 | /* check argument value ranges */ | |
68 | vpaddd iHiDelta+__svml_slog1p_data_internal(%rip), %ymm10, %ymm9 /* ymm9 = bits(high) + iHiDelta (integer add) */ | |
69 | vsubps %ymm10, %ymm5, %ymm7 /* ymm7 = max - high */ | |
70 | vpsrad $23, %ymm11, %ymm14 /* ymm14 = ymm11 >> 23 (arithmetic): integer exponent, presumably k of the header */ | |
71 | vpand iOffExpoMask+__svml_slog1p_data_internal(%rip), %ymm11, %ymm12 /* ymm12 = significand bits of ymm11 */ | |
72 | vpslld $23, %ymm14, %ymm15 /* ymm15 = exponent shifted back into the exponent field */ | |
73 | vcvtdq2ps %ymm14, %ymm0 /* ymm0 = exponent converted to float */ | |
74 | vpsubd %ymm15, %ymm2, %ymm14 /* ymm14 = bits(1.0) - ymm15: scale factor 2^-k for in-range k */ | |
75 | vandnps %ymm3, %ymm4, %ymm1 /* ymm1 = x & ~SgnMask: sign bit of x */ | |
76 | vaddps %ymm7, %ymm6, %ymm4 /* ymm4 = min + (max - high): low part of 1+x */ | |
77 | vpaddd %ymm13, %ymm12, %ymm6 /* ymm6 = significand bits + iBrkValue: reduced high part */ | |
78 | vmulps %ymm4, %ymm14, %ymm7 /* ymm7 = scale factor * low part */ | |
79 | ||
80 | /* polynomial evaluation */ | |
81 | vsubps %ymm2, %ymm6, %ymm2 /* ymm2 = reduced high part - 1.0 */ | |
82 | vpcmpgtd %ymm9, %ymm8, %ymm5 /* ymm5 = all-ones in lanes where iLoRange > ymm9 (signed): special inputs */ | |
83 | vmovups sPoly+224+__svml_slog1p_data_internal(%rip), %ymm8 /* ymm8 = P7, start of Horner scheme */ | |
84 | vaddps %ymm2, %ymm7, %ymm9 /* ymm9 = R = scaled low part + (reduced high part - 1.0) */ | |
85 | vfmadd213ps sPoly+192+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P6 */ | |
86 | vfmadd213ps sPoly+160+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P5 */ | |
87 | vfmadd213ps sPoly+128+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P4 */ | |
88 | vfmadd213ps sPoly+96+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P3 */ | |
89 | vfmadd213ps sPoly+64+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P2 */ | |
90 | vfmadd213ps sPoly+32+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P1 */ | |
91 | vfmadd213ps sPoly+__svml_slog1p_data_internal(%rip), %ymm9, %ymm8 /* ymm8 = ymm8 * R + P0 */ | |
92 | vmulps %ymm8, %ymm9, %ymm10 /* ymm10 = R * poly */ | |
93 | vfmadd213ps %ymm9, %ymm9, %ymm10 /* ymm10 = R * (R * poly) + R */ | |
94 | ||
95 | /* final reconstruction */ | |
96 | vfmadd132ps sLn2+__svml_slog1p_data_internal(%rip), %ymm10, %ymm0 /* ymm0 = ymm0 * ln(2) + ymm10 */ | |
97 | ||
98 | /* combine and get argument value range mask */ | |
99 | vmovmskps %ymm5, %edx /* edx bit i = 1 when lane i is a special input */ | |
100 | vorps %ymm1, %ymm0, %ymm0 /* OR the sign bit of x into the result */ | |
101 | testl %edx, %edx /* any special lane? */ | |
102 | ||
103 | /* Go to special inputs processing branch */ | |
104 | jne L(SPECIAL_VALUES_BRANCH) | |
105 | # LOE rbx r12 r13 r14 r15 edx ymm0 ymm3 | |
106 | ||
107 | /* Restore registers | |
108 | * and exit the function | |
109 | */ | |
74265c16 SP |
110 | |
111 | L(EXIT): /* common return path; result is already in %ymm0 */ | |
b44b6f42 SP |
112 | movq %rbp, %rsp /* discard the aligned scratch area */ |
113 | popq %rbp /* restore the caller's %rbp */ | |
114 | cfi_def_cfa(7, 8) | |
115 | cfi_restore(6) | |
116 | ret | |
117 | cfi_def_cfa(6, 16) /* code after the ret still runs with the %rbp frame: re-state its CFI */ | |
118 | cfi_offset(6, -16) | |
119 | ||
120 | /* Branch to process | |
121 | * special inputs | |
122 | */ | |
74265c16 SP |
123 | |
124 | L(SPECIAL_VALUES_BRANCH): /* entered with edx = special-lane mask, ymm3 = x, ymm0 = vector result */ | |
b44b6f42 SP |
125 | vmovups %ymm3, 32(%rsp) /* inputs spilled to 32..63(%rsp) */ |
126 | vmovups %ymm0, 64(%rsp) /* vector results spilled to 64..95(%rsp) */ | |
127 | # LOE rbx r12 r13 r14 r15 edx ymm0 | |
128 | ||
129 | xorl %eax, %eax /* eax = 0: first lane index */ | |
130 | # LOE rbx r12 r13 r14 r15 eax edx | |
131 | ||
132 | vzeroupper /* clear upper YMM halves before the scalar calls */ | |
133 | movq %r12, 16(%rsp) /* save callee-saved r12 */ | |
134 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ | |
135 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 | |
136 | movl %eax, %r12d /* r12d = lane index, starts at 0 */ | |
137 | movq %r13, 8(%rsp) /* save callee-saved r13 */ | |
138 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ | |
139 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 | |
140 | movl %edx, %r13d /* r13d = special-lane mask */ | |
141 | movq %r14, (%rsp) /* save callee-saved r14; it is used as the memory index in the scalar call block */ | |
142 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ | |
143 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 | |
144 | # LOE rbx r15 r12d r13d | |
145 | ||
146 | /* Range mask | |
147 | * bits check | |
148 | */ | |
74265c16 SP |
149 | |
150 | L(RANGEMASK_CHECK): /* r12d = lane index, r13d = special-lane mask */ | |
b44b6f42 | 151 | btl %r12d, %r13d /* CF = mask bit for the current lane */ |
74265c16 | 152 |
b44b6f42 SP |
153 | /* Call scalar math function when the current lane is flagged */ |
154 | jc L(SCALAR_MATH_CALL) | |
155 | # LOE rbx r15 r12d r13d | |
74265c16 | 156 | |
b44b6f42 SP |
157 | /* Special inputs |
158 | * processing loop | |
159 | */ | |
74265c16 SP |
160 | |
161 | L(SPECIAL_VALUES_LOOP): /* advance to the next lane; finish after lane 7 */ | |
b44b6f42 SP |
162 | incl %r12d /* next lane index */ |
163 | cmpl $8, %r12d /* 8 lanes in total */ | |
164 | ||
165 | /* Check bits in range mask */ | |
166 | jl L(RANGEMASK_CHECK) | |
167 | # LOE rbx r15 r12d r13d | |
168 | ||
169 | movq 16(%rsp), %r12 /* all lanes done: restore the saved callee-saved registers */ | |
170 | cfi_restore(12) | |
171 | movq 8(%rsp), %r13 | |
172 | cfi_restore(13) | |
173 | movq (%rsp), %r14 | |
174 | cfi_restore(14) | |
175 | vmovups 64(%rsp), %ymm0 /* reload the result vector, including lanes patched by the scalar calls */ | |
176 | ||
177 | /* Go to exit */ | |
178 | jmp L(EXIT) | |
179 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ | |
180 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 | |
181 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ | |
182 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 | |
183 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ | |
184 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 | |
185 | # LOE rbx r12 r13 r14 r15 ymm0 | |
186 | ||
187 | /* Scalar math function call | |
188 | * to process special input | |
189 | */ | |
74265c16 SP |
190 | |
191 | L(SCALAR_MATH_CALL): /* recompute the lane selected by r12d with scalar log1pf */ | |
b44b6f42 | 192 | movl %r12d, %r14d /* r14 = zero-extended lane index, usable as a memory index */ |
3079f652 | 193 | vmovss 32(%rsp, %r14, 4), %xmm0 /* xmm0 = saved input for this lane */ |
b44b6f42 SP |
194 | call log1pf@PLT |
195 | # LOE rbx r14 r15 r12d r13d xmm0 | |
74265c16 | 196 | |
3079f652 | 197 | vmovss %xmm0, 64(%rsp, %r14, 4) /* overwrite this lane of the saved result vector */ |
74265c16 | 198 | |
b44b6f42 SP |
199 | /* Process special inputs in loop */ |
200 | jmp L(SPECIAL_VALUES_LOOP) | |
201 | # LOE rbx r15 r12d r13d | |
74265c16 SP |
202 | END(_ZGVdN8v_log1pf_avx2) |
203 | ||
b44b6f42 SP |
204 | .section .rodata, "a" |
205 | .align 32 | |
74265c16 SP |
206 | |
207 | #ifdef __svml_slog1p_data_internal_typedef /* C-style description of the table layout below */ | |
208 | typedef unsigned int VUINT32; | |
209 | typedef struct { | |
b44b6f42 SP |
210 | __declspec(align(32)) VUINT32 SgnMask[8][1]; |
211 | __declspec(align(32)) VUINT32 sOne[8][1]; | |
212 | __declspec(align(32)) VUINT32 sPoly[8][8][1]; | |
213 | __declspec(align(32)) VUINT32 iHiDelta[8][1]; | |
214 | __declspec(align(32)) VUINT32 iLoRange[8][1]; | |
215 | __declspec(align(32)) VUINT32 iBrkValue[8][1]; | |
216 | __declspec(align(32)) VUINT32 iOffExpoMask[8][1]; | |
217 | __declspec(align(32)) VUINT32 sLn2[8][1]; | |
74265c16 SP |
218 | } __svml_slog1p_data_internal; |
219 | #endif | |
220 | __svml_slog1p_data_internal: | |
b44b6f42 SP |
221 | /* SgnMask = every bit except the sign bit; used inverted to extract the sign of x */ |
222 | .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff | |
223 | /* sOne = SP 1.0 */ | |
224 | .align 32 | |
225 | .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 | |
226 | /* sPoly[] = SP polynomial coefficients P0..P7, one 32-byte row each */ | |
227 | .align 32 | |
228 | .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */ | |
229 | .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */ | |
230 | .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */ | |
231 | .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */ | |
232 | .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */ | |
233 | .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */ | |
234 | .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */ | |
235 | .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */ | |
236 | /* iHiDelta = SP 80000000-7f000000; integer-added to the high part of 1+x before the range compare */ | |
237 | .align 32 | |
238 | .long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000 | |
239 | /* iLoRange = SP 00800000+iHiDelta; lanes whose biased high part is below this (signed compare) take the scalar path */ | |
240 | .align 32 | |
241 | .long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000 | |
242 | /* iBrkValue = SP 2/3 */ | |
243 | .align 32 | |
244 | .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab | |
245 | /* iOffExpoMask = SP significand mask */ | |
246 | .align 32 | |
247 | .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff | |
248 | /* sLn2 = SP ln(2) */ | |
249 | .align 32 | |
250 | .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 | |
251 | .align 32 | |
252 | .type __svml_slog1p_data_internal, @object | |
253 | .size __svml_slog1p_data_internal, .-__svml_slog1p_data_internal |