]>
Commit | Line | Data |
---|---|---|
37475ba8 | 1 | /* Function hypotf vectorized with AVX-512. |
6d7e8eda | 2 | Copyright (C) 2021-2023 Free Software Foundation, Inc. |
37475ba8 SP |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | https://www.gnu.org/licenses/. */ | |
18 | ||
19 | /* | |
20 | * ALGORITHM DESCRIPTION: | |
21 | * | |
22 | * HIGH LEVEL OVERVIEW | |
23 | * | |
24 | * Calculate z = (x*x+y*y) | |
25 | * Calculate reciplicle sqrt (z) | |
26 | * Calculate make two NR iterations | |
27 | * | |
28 | * ALGORITHM DETAILS | |
29 | * | |
30 | * Multiprecision branch for _HA_ only | |
31 | * Remove sigm from both arguments | |
32 | * Find maximum (_x) and minimum (_y) (by abs value) between arguments | |
33 | * Split _x int _a and _b for multiprecision | |
34 | * If _x >> _y we will we will not split _y for multiprecision | |
35 | * all _y will be put into lower part (_d) and higher part (_c = 0) | |
36 | * Fixing _hilo_mask for the case _x >> _y | |
37 | * Split _y into _c and _d for multiprecision with fixed mask | |
38 | * | |
39 | * compute Hi and Lo parts of _z = _x*_x + _y*_y | |
40 | * | |
41 | * _zHi = _a*_a + _c*_c | |
42 | * _zLo = (_x + _a)*_b + _d*_y + _d*_c | |
43 | * _z = _zHi + _zLo | |
44 | * | |
45 | * No multiprecision branch for _LA_ and _EP_ | |
46 | * _z = _VARG1 * _VARG1 + _VARG2 * _VARG2 | |
47 | * | |
48 | * Check _z exponent to be withing borders [1E3 ; 60A] else goto Callout | |
49 | * | |
50 | * Compute resciplicle sqrt s0 ~ 1.0/sqrt(_z), | |
51 | * that multiplied by _z, is final result for _EP_ version. | |
52 | * | |
53 | * First iteration (or zero iteration): | |
54 | * s = z * s0 | |
55 | * h = .5 * s0 | |
56 | * d = s * h - .5 | |
57 | * | |
58 | * Second iteration: | |
59 | * h = d * h + h | |
60 | * s = s * d + s | |
61 | * d = s * s - z (in multiprecision for _HA_) | |
62 | * | |
63 | * result = s - h * d | |
64 | * | |
65 | * EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2) | |
075dd8a0 | 66 | * with all intermediate operations done in target precision for i=1, .., n. |
37475ba8 SP |
67 | * It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target |
68 | * precision (for some i). It can return result y[i]=NAN in case | |
69 | * a[i]^2+b[i]^2 overflow in target precision, for some i. It can return | |
70 | * result y[i]=NAN in case a[i] or b[i] is infinite, for some i. | |
71 | * | |
72 | * | |
73 | */ | |
74 | ||
75 | /* Offsets for data table __svml_shypot_data_internal | |
76 | */ | |
075dd8a0 SP |
77 | #define _sAbsMask 0 |
78 | #define _sHalf 64 | |
79 | #define _iExpBound 128 | |
37475ba8 SP |
80 | |
81 | #include <sysdep.h> | |
82 | ||
95177b78 | 83 | .section .text.evex512, "ax", @progbits |
37475ba8 | 84 | ENTRY(_ZGVeN16vv_hypotf_skx) |
075dd8a0 SP |
85 | pushq %rbp |
86 | cfi_def_cfa_offset(16) | |
87 | movq %rsp, %rbp | |
88 | cfi_def_cfa(6, 16) | |
89 | cfi_offset(6, -16) | |
90 | andq $-64, %rsp | |
91 | subq $256, %rsp | |
92 | vgetexpps {sae}, %zmm0, %zmm2 | |
93 | vgetexpps {sae}, %zmm1, %zmm3 | |
94 | vmovups _sHalf+__svml_shypot_data_internal(%rip), %zmm6 | |
95 | vmaxps {sae}, %zmm3, %zmm2, %zmm4 | |
96 | vmulps {rn-sae}, %zmm0, %zmm0, %zmm2 | |
97 | vandps _sAbsMask+__svml_shypot_data_internal(%rip), %zmm4, %zmm5 | |
98 | vfmadd231ps {rn-sae}, %zmm1, %zmm1, %zmm2 | |
99 | vpcmpd $5, _iExpBound+__svml_shypot_data_internal(%rip), %zmm5, %k0 | |
100 | vrsqrt14ps %zmm2, %zmm7 | |
101 | kmovw %k0, %edx | |
102 | vmulps {rn-sae}, %zmm7, %zmm2, %zmm9 | |
103 | vmulps {rn-sae}, %zmm7, %zmm6, %zmm8 | |
104 | vfnmadd231ps {rn-sae}, %zmm9, %zmm9, %zmm2 | |
105 | vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm2 | |
106 | ||
107 | /* | |
108 | * VSCALEF( S, _VRES1, _VRES1, sExp ); | |
109 | * The end of implementation | |
110 | */ | |
111 | testl %edx, %edx | |
112 | ||
113 | /* Go to special inputs processing branch */ | |
114 | jne L(SPECIAL_VALUES_BRANCH) | |
115 | # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2 | |
116 | ||
117 | /* Restore registers | |
118 | * and exit the function | |
119 | */ | |
37475ba8 SP |
120 | |
121 | L(EXIT): | |
075dd8a0 SP |
122 | vmovaps %zmm2, %zmm0 |
123 | movq %rbp, %rsp | |
124 | popq %rbp | |
125 | cfi_def_cfa(7, 8) | |
126 | cfi_restore(6) | |
127 | ret | |
128 | cfi_def_cfa(6, 16) | |
129 | cfi_offset(6, -16) | |
130 | ||
131 | /* Branch to process | |
132 | * special inputs | |
133 | */ | |
37475ba8 SP |
134 | |
135 | L(SPECIAL_VALUES_BRANCH): | |
075dd8a0 SP |
136 | vmovups %zmm0, 64(%rsp) |
137 | vmovups %zmm1, 128(%rsp) | |
138 | vmovups %zmm2, 192(%rsp) | |
139 | # LOE rbx r12 r13 r14 r15 edx zmm2 | |
140 | ||
141 | xorl %eax, %eax | |
142 | # LOE rbx r12 r13 r14 r15 eax edx | |
143 | ||
144 | vzeroupper | |
145 | movq %r12, 16(%rsp) | |
146 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */ | |
147 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22 | |
148 | movl %eax, %r12d | |
149 | movq %r13, 8(%rsp) | |
150 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */ | |
151 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22 | |
152 | movl %edx, %r13d | |
153 | movq %r14, (%rsp) | |
154 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */ | |
155 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22 | |
156 | # LOE rbx r15 r12d r13d | |
157 | ||
158 | /* Range mask | |
159 | * bits check | |
160 | */ | |
37475ba8 SP |
161 | |
162 | L(RANGEMASK_CHECK): | |
075dd8a0 | 163 | btl %r12d, %r13d |
37475ba8 | 164 | |
075dd8a0 SP |
165 | /* Call scalar math function */ |
166 | jc L(SCALAR_MATH_CALL) | |
167 | # LOE rbx r15 r12d r13d | |
37475ba8 | 168 | |
075dd8a0 SP |
169 | /* Special inputs |
170 | * processing loop | |
171 | */ | |
37475ba8 SP |
172 | |
173 | L(SPECIAL_VALUES_LOOP): | |
075dd8a0 SP |
174 | incl %r12d |
175 | cmpl $16, %r12d | |
176 | ||
177 | /* Check bits in range mask */ | |
178 | jl L(RANGEMASK_CHECK) | |
179 | # LOE rbx r15 r12d r13d | |
180 | ||
181 | movq 16(%rsp), %r12 | |
182 | cfi_restore(12) | |
183 | movq 8(%rsp), %r13 | |
184 | cfi_restore(13) | |
185 | movq (%rsp), %r14 | |
186 | cfi_restore(14) | |
187 | vmovups 192(%rsp), %zmm2 | |
188 | ||
189 | /* Go to exit */ | |
190 | jmp L(EXIT) | |
191 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) */ | |
192 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22 | |
193 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) */ | |
194 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22 | |
195 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) */ | |
196 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22 | |
197 | # LOE rbx r12 r13 r14 r15 zmm2 | |
198 | ||
199 | /* Scalar math fucntion call | |
200 | * to process special input | |
201 | */ | |
37475ba8 SP |
202 | |
203 | L(SCALAR_MATH_CALL): | |
075dd8a0 | 204 | movl %r12d, %r14d |
3079f652 NG |
205 | vmovss 64(%rsp, %r14, 4), %xmm0 |
206 | vmovss 128(%rsp, %r14, 4), %xmm1 | |
075dd8a0 SP |
207 | call hypotf@PLT |
208 | # LOE rbx r14 r15 r12d r13d xmm0 | |
37475ba8 | 209 | |
3079f652 | 210 | vmovss %xmm0, 192(%rsp, %r14, 4) |
37475ba8 | 211 | |
075dd8a0 SP |
212 | /* Process special inputs in loop */ |
213 | jmp L(SPECIAL_VALUES_LOOP) | |
214 | # LOE rbx r15 r12d r13d | |
37475ba8 SP |
215 | END(_ZGVeN16vv_hypotf_skx) |
216 | ||
075dd8a0 SP |
217 | .section .rodata, "a" |
218 | .align 64 | |
37475ba8 SP |
219 | |
220 | #ifdef __svml_shypot_data_internal_typedef | |
221 | typedef unsigned int VUINT32; | |
075dd8a0 SP |
222 | typedef struct { |
223 | __declspec(align(64)) VUINT32 _sAbsMask[16][1]; | |
224 | __declspec(align(64)) VUINT32 _sHalf[16][1]; | |
225 | __declspec(align(64)) VUINT32 _iExpBound[16][1]; | |
37475ba8 SP |
226 | } __svml_shypot_data_internal; |
227 | #endif | |
228 | __svml_shypot_data_internal: | |
075dd8a0 SP |
229 | .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */ |
230 | .align 64 | |
231 | .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf */ | |
232 | /* fma based algorithm*/ | |
233 | .align 64 | |
234 | .long 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000 /* _iExpBound */ | |
235 | .align 64 | |
236 | .type __svml_shypot_data_internal, @object | |
237 | .size __svml_shypot_data_internal, .-__svml_shypot_data_internal |