]>
Commit | Line | Data |
---|---|---|
37475ba8 | 1 | /* Function hypotf vectorized with AVX2. |
dff8da6b | 2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
37475ba8 SP |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | https://www.gnu.org/licenses/. */ | |
18 | ||
19 | /* | |
20 | * ALGORITHM DESCRIPTION: | |
21 | * | |
22 | * HIGH LEVEL OVERVIEW | |
23 | * | |
24 | * Calculate z = (x*x+y*y) | |
25 | * Calculate reciprocal sqrt (z) | |
26 | * Make two NR (Newton-Raphson) iterations | |
27 | * | |
28 | * ALGORITHM DETAILS | |
29 | * | |
30 | * Multiprecision branch for _HA_ only | |
31 | * Remove sign from both arguments | |
32 | * Find maximum (_x) and minimum (_y) (by abs value) between arguments | |
33 | * Split _x into _a and _b for multiprecision | |
34 | * If _x >> _y we will not split _y for multiprecision | |
35 | * all _y will be put into lower part (_d) and higher part (_c = 0) | |
36 | * Fixing _hilo_mask for the case _x >> _y | |
37 | * Split _y into _c and _d for multiprecision with fixed mask | |
38 | * | |
39 | * compute Hi and Lo parts of _z = _x*_x + _y*_y | |
40 | * | |
41 | * _zHi = _a*_a + _c*_c | |
42 | * _zLo = (_x + _a)*_b + _d*_y + _d*_c | |
43 | * _z = _zHi + _zLo | |
44 | * | |
45 | * No multiprecision branch for _LA_ and _EP_ | |
46 | * _z = _VARG1 * _VARG1 + _VARG2 * _VARG2 | |
47 | * | |
1d2971b5 | 48 | * Check _z exponent to be within borders [1E3 ; 60A] else goto Callout |
37475ba8 SP |
49 | * |
50 | * Compute reciprocal sqrt s0 ~ 1.0/sqrt(_z), | |
51 | * which, multiplied by _z, is the final result for the _EP_ version. | |
52 | * | |
53 | * First iteration (or zero iteration): | |
54 | * s = z * s0 | |
55 | * h = .5 * s0 | |
56 | * d = .5 - s * h | |
57 | * | |
58 | * Second iteration: | |
59 | * h = d * h + h | |
60 | * s = s * d + s | |
61 | * d = s * s - z (in multiprecision for _HA_) | |
62 | * | |
63 | * result = s - h * d | |
64 | * | |
65 | * EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2) | |
a8e60c7e | 66 | * with all intermediate operations done in target precision for i=1, .., n. |
37475ba8 SP |
67 | * It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target |
68 | * precision (for some i). It can return result y[i]=NAN in case | |
69 | * a[i]^2+b[i]^2 overflow in target precision, for some i. It can return | |
70 | * result y[i]=NAN in case a[i] or b[i] is infinite, for some i. | |
71 | * | |
72 | * | |
73 | */ | |
74 | ||
75 | /* Offsets for data table __svml_shypot_data_internal | |
76 | */ | |
a8e60c7e SP |
77 | #define _sHiLoMask 0 |
78 | #define _sAbsMask 32 | |
79 | #define _sHalf 64 | |
80 | #define _LowBoundary 96 | |
81 | #define _HighBoundary 128 | |
37475ba8 SP |
82 | |
83 | #include <sysdep.h> | |
84 | ||
a8e60c7e | 85 | .section .text.avx2, "ax", @progbits |
37475ba8 | 86 | ENTRY(_ZGVdN8vv_hypotf_avx2) |
a8e60c7e SP |
87 | pushq %rbp |
88 | cfi_def_cfa_offset(16) | |
89 | movq %rsp, %rbp | |
90 | cfi_def_cfa(6, 16) | |
91 | cfi_offset(6, -16) | |
92 | andq $-32, %rsp | |
93 | subq $128, %rsp | |
94 | ||
95 | /* | |
96 | * Implementation: 8-lane single-precision hypot, inputs in ymm0 (_VARG1) and ymm1 (_VARG2), result returned in ymm0 | |
97 | * Multiprecision branch for _HA_ only | |
98 | * No multiprecision branch for _LA_ (the variant implemented here) | |
99 | * _z = _VARG1 * _VARG1 + _VARG2 * _VARG2, accumulated in ymm8 (vmulps here, vfmadd231ps below) | |
100 | */ | |
101 | vmulps %ymm0, %ymm0, %ymm8 | |
102 | ||
103 | /* | |
104 | * Variables | |
105 | * Defines | |
106 | * Constants loading: ymm7 = _sHalf (.5 in every lane) | |
107 | */ | |
108 | vmovups _sHalf+__svml_shypot_data_internal(%rip), %ymm7 | |
109 | ||
1d2971b5 | 110 | /* Check _z exponent to be within borders [1E3 ; 60A] else goto Callout. Done with signed dword compares on the raw bits of _z: ymm3 flags lanes where _LowBoundary > _z */ |
a8e60c7e SP |
111 | vmovups _LowBoundary+__svml_shypot_data_internal(%rip), %ymm2 |
112 | vfmadd231ps %ymm1, %ymm1, %ymm8 | |
113 | ||
114 | /* _s0 ~ 1.0/sqrt(_z): vrsqrtps estimate kept in ymm6 */ | |
115 | vrsqrtps %ymm8, %ymm6 | |
116 | vpcmpgtd %ymm8, %ymm2, %ymm3 | |
117 | ||
118 | /* First iteration: ymm9 = s = _z * _s0, ymm2 = h = .5 * _s0, ymm7 = d = .5 - s * h, then ymm9 = s * d + s */ | |
119 | vmulps %ymm8, %ymm6, %ymm9 | |
120 | vmulps %ymm7, %ymm6, %ymm2 | |
121 | vfnmadd231ps %ymm9, %ymm2, %ymm7 | |
122 | vfmadd213ps %ymm9, %ymm7, %ymm9 | |
123 | ||
124 | /* Second iteration: ymm2 = h * d + h. The vpcmpgtd below flags lanes where _z > _HighBoundary and vpor merges both range flags into ymm5 */ | |
125 | vfmadd132ps %ymm7, %ymm2, %ymm2 | |
126 | vpcmpgtd _HighBoundary+__svml_shypot_data_internal(%rip), %ymm8, %ymm4 | |
127 | vpor %ymm4, %ymm3, %ymm5 | |
128 | ||
129 | /* Finish second iteration in native precision for _LA_: ymm8 = s * s - _z, then ymm2 = s - h * ymm8 is the vector result. vmovmskps packs the per-lane range flags into the low 8 bits of edx */ | |
130 | vfmsub231ps %ymm9, %ymm9, %ymm8 | |
131 | vmovmskps %ymm5, %edx | |
132 | vfnmadd213ps %ymm9, %ymm8, %ymm2 | |
133 | ||
134 | /* The end of implementation: edx != 0 means at least one lane was flagged out of range */ | |
135 | testl %edx, %edx | |
136 | ||
137 | /* Go to special inputs processing branch */ | |
138 | jne L(SPECIAL_VALUES_BRANCH) | |
139 | # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2 | |
140 | ||
141 | /* Restore registers | |
142 | * and exit the function: the result in ymm2 is moved to ymm0 and the aligned frame is dropped by restoring rsp from rbp | |
143 | */ | |
37475ba8 SP |
144 | 
145 | L(EXIT): | |
a8e60c7e SP |
146 | vmovaps %ymm2, %ymm0 |
147 | movq %rbp, %rsp | |
148 | popq %rbp | |
149 | cfi_def_cfa(7, 8) | |
150 | cfi_restore(6) | |
151 | ret | |
152 | cfi_def_cfa(6, 16) | |
153 | cfi_offset(6, -16) | |
154 | ||
155 | /* Branch to process | |
156 | * special inputs: spill _VARG1 to 32(%rsp), _VARG2 to 64(%rsp) and the vector result to 96(%rsp). r12, r13, r14 are saved at 16, 8, 0(%rsp), then r12d = lane index (starts at 0) and r13d = lane mask taken from edx | |
157 | */ | |
37475ba8 SP |
158 | 
159 | L(SPECIAL_VALUES_BRANCH): | |
a8e60c7e SP |
160 | vmovups %ymm0, 32(%rsp) |
161 | vmovups %ymm1, 64(%rsp) | |
162 | vmovups %ymm2, 96(%rsp) | |
163 | # LOE rbx r12 r13 r14 r15 edx ymm2 | |
164 | ||
165 | xorl %eax, %eax | |
166 | # LOE rbx r12 r13 r14 r15 eax edx | |
167 | ||
168 | vzeroupper | |
169 | movq %r12, 16(%rsp) | |
170 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) */ | |
171 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22 | |
172 | movl %eax, %r12d | |
173 | movq %r13, 8(%rsp) | |
174 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */ | |
175 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22 | |
176 | movl %edx, %r13d | |
177 | movq %r14, (%rsp) | |
178 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */ | |
179 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22 | |
180 | # LOE rbx r15 r12d r13d | |
181 | ||
182 | /* Range mask | |
183 | * bits check: btl copies bit r12d of the lane mask r13d into CF, so CF is set when the current lane was flagged | |
184 | */ | |
37475ba8 SP |
185 | 
186 | L(RANGEMASK_CHECK): | |
a8e60c7e | 187 | btl %r12d, %r13d |
37475ba8 | 188 | 
a8e60c7e SP |
189 | /* Call scalar math function for a flagged lane */ |
190 | jc L(SCALAR_MATH_CALL) | |
191 | # LOE rbx r15 r12d r13d | |
37475ba8 | 192 | 
a8e60c7e SP |
193 | /* Special inputs |
194 | * processing loop: advance the lane index and keep checking until all 8 lanes were visited | |
195 | */ | |
37475ba8 SP |
196 | 
197 | L(SPECIAL_VALUES_LOOP): | |
a8e60c7e SP |
198 | incl %r12d |
199 | cmpl $8, %r12d | |
200 | ||
201 | /* Check bits in range mask */ | |
202 | jl L(RANGEMASK_CHECK) | |
203 | # LOE rbx r15 r12d r13d | |
204 | ||
205 | movq 16(%rsp), %r12 | |
206 | cfi_restore(12) | |
207 | movq 8(%rsp), %r13 | |
208 | cfi_restore(13) | |
209 | movq (%rsp), %r14 | |
210 | cfi_restore(14) | |
211 | vmovups 96(%rsp), %ymm2 | |
212 | ||
213 | /* Go to exit: ymm2 was just reloaded from the result slot at 96(%rsp), which includes any lanes rewritten by the scalar calls */ | |
214 | jmp L(EXIT) | |
215 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) */ | |
216 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22 | |
217 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) */ | |
218 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22 | |
219 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) */ | |
220 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22 | |
221 | # LOE rbx r12 r13 r14 r15 ymm2 | |
222 | ||
1d2971b5 | 223 | /* Scalar math function call
a8e60c7e SP |
224 | * to process special input: r14 = lane index, hypotf gets the lane of _VARG1 in xmm0 and of _VARG2 in xmm1, and its result overwrites that lane of the result slot at 96(%rsp) |
225 | */ | |
37475ba8 SP |
226 | 
227 | L(SCALAR_MATH_CALL): | |
a8e60c7e | 228 | movl %r12d, %r14d |
3079f652 NG |
229 | vmovss 32(%rsp, %r14, 4), %xmm0 |
230 | vmovss 64(%rsp, %r14, 4), %xmm1 | |
a8e60c7e SP |
231 | call hypotf@PLT |
232 | # LOE rbx r14 r15 r12d r13d xmm0 | |
37475ba8 | 233 | 
3079f652 | 234 | vmovss %xmm0, 96(%rsp, %r14, 4) |
37475ba8 | 235 | 
a8e60c7e SP |
236 | /* Process special inputs in loop */ |
237 | jmp L(SPECIAL_VALUES_LOOP) | |
238 | # LOE rbx r15 r12d r13d | |
37475ba8 SP |
239 | END(_ZGVdN8vv_hypotf_avx2) |
240 | ||
a8e60c7e SP |
241 | .section .rodata, "a" |
242 | .align 32 | |
37475ba8 SP |
243 | |
244 | #ifdef __svml_shypot_data_internal_typedef |
245 | typedef unsigned int VUINT32; | |
a8e60c7e SP |
246 | typedef struct { |
247 | __declspec(align(32)) VUINT32 _sHiLoMask[8][1]; | |
248 | __declspec(align(32)) VUINT32 _sAbsMask[8][1]; | |
249 | __declspec(align(32)) VUINT32 _sHalf[8][1]; | |
250 | __declspec(align(32)) VUINT32 _LowBoundary[8][1]; | |
251 | __declspec(align(32)) VUINT32 _HighBoundary[8][1]; | |
37475ba8 SP |
252 | } __svml_shypot_data_internal; |
253 | #endif | |
254 | __svml_shypot_data_internal: |
a8e60c7e SP |
255 | /* legacy algorithm: each field is 8 identical dwords, one per lane, placed at the byte offset given by the matching define near the top of the file */ |
256 | .long 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000 /* _sHiLoMask (offset 0), not loaded by the _ZGVdN8vv_hypotf_avx2 code above */ | |
257 | .align 32 | |
258 | .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask (offset 32): every bit except the top (sign) bit, not loaded by the _ZGVdN8vv_hypotf_avx2 code above */ | |
259 | .align 32 | |
260 | .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf (offset 64): single-precision .5 */ | |
261 | .align 32 | |
262 | .long 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000 /* _LowBoundary (offset 96): lanes whose _z bits compare below this value (signed dword compare) take the scalar fallback */ | |
263 | .align 32 | |
264 | .long 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000 /* _HighBoundary (offset 128): lanes whose _z bits compare above this value (signed dword compare) take the scalar fallback */ | |
265 | .align 32 | |
266 | .type __svml_shypot_data_internal, @object | |
267 | .size __svml_shypot_data_internal, .-__svml_shypot_data_internal |