]>
Commit | Line | Data |
---|---|---|
d5efd131 MF |
1 | .file "sqrt.s" |
2 | ||
3 | ||
4 | // Copyright (c) 2000 - 2003, Intel Corporation | |
5 | // All rights reserved. | |
0347518d | 6 | // |
d5efd131 | 7 | // Contributed 2000 by the Intel Numerics Group, Intel Corporation |
0347518d | 8 | // |
d5efd131 MF |
9 | // Redistribution and use in source and binary forms, with or without |
10 | // modification, are permitted provided that the following conditions are | |
11 | // met: | |
12 | // | |
13 | // * Redistributions of source code must retain the above copyright | |
14 | // notice, this list of conditions and the following disclaimer. | |
15 | // | |
16 | // * Redistributions in binary form must reproduce the above copyright | |
17 | // notice, this list of conditions and the following disclaimer in the | |
18 | // documentation and/or other materials provided with the distribution. | |
19 | // | |
20 | // * The name of Intel Corporation may not be used to endorse or promote | |
21 | // products derived from this software without specific prior written | |
22 | // permission. | |
23 | ||
0347518d MF |
24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
d5efd131 | 26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
0347518d | 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS |
d5efd131 | 28 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
0347518d MF |
29 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
30 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
31 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
d5efd131 | 32 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING |
0347518d MF |
33 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
34 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
35 | // | |
d5efd131 | 36 | // Intel Corporation is the author of this code, and requests that all |
0347518d | 37 | // problem reports or change requests be submitted to it directly at |
d5efd131 MF |
38 | // http://www.intel.com/software/products/opensource/libraries/num.htm. |
39 | // | |
40 | //******************************************************************** | |
41 | // History | |
42 | //******************************************************************** | |
43 | // 02/02/00 Initial version | |
44 | // 04/04/00 Unwind support added | |
45 | // 08/15/00 Bundle added after call to __libm_error_support to properly | |
46 | // set [the previously overwritten] GR_Parameter_RESULT. | |
47 | // 02/10/03 Reordered header: .section, .global, .proc, .align | |
48 | // | |
49 | //******************************************************************** | |
50 | // | |
51 | // Function: Combined sqrt(x), where | |
52 | // _ | |
53 | // sqrt(x) = |x, for double precision x values | |
54 | // | |
55 | //******************************************************************** | |
56 | // | |
57 | // Accuracy: Correctly Rounded | |
58 | // | |
59 | //******************************************************************** | |
60 | // | |
61 | // Resources Used: | |
62 | // | |
63 | // Floating-Point Registers: f8 (Input and Return Value) | |
64 | // f7 -f14 | |
65 | // | |
66 | // General Purpose Registers: | |
67 | // r32-r36 (Locals) | |
68 | // r37-r40 (Used to pass arguments to error handling routine) | |
69 | // | |
70 | // Predicate Registers: p6, p7, p8 | |
71 | // | |
72 | //********************************************************************* | |
73 | // | |
74 | // IEEE Special Conditions: | |
75 | // | |
76 | // All faults and exceptions should be raised correctly. | |
77 | // sqrt(QNaN) = QNaN | |
78 | // sqrt(SNaN) = QNaN | |
79 | // sqrt(+/-0) = +/-0 | |
80 | // sqrt(negative) = QNaN and error handling is called | |
81 | // | |
82 | //********************************************************************* | |
83 | // | |
84 | // Implementation: | |
85 | // | |
86 | // Modified Newton-Raphson Algorithm | |
87 | // | |
88 | //********************************************************************* | |
89 | ||
90 | GR_SAVE_PFS = r33 | |
91 | GR_SAVE_B0 = r34 | |
92 | GR_SAVE_GP = r35 | |
93 | ||
94 | GR_Parameter_X = r37 | |
95 | GR_Parameter_Y = r38 | |
96 | GR_Parameter_RESULT = r39 | |
97 | ||
98 | ||
99 | .section .text | |
100 | GLOBAL_IEEE754_ENTRY(sqrt) | |
101 | { .mfi | |
102 | alloc r32= ar.pfs,0,5,4,0 | |
103 | frsqrta.s0 f7,p6=f8 | |
104 | nop.i 0 | |
105 | } { .mlx | |
106 | // BEGIN DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM | |
107 | nop.m 0 | |
108 | // exponent of +1/2 in r2 | |
109 | movl r2 = 0x0fffe;; | |
110 | } { .mmi | |
111 | // +1/2 in f9 | |
112 | setf.exp f9 = r2 | |
113 | nop.m 0 | |
114 | nop.i 0 | |
115 | } { .mlx | |
116 | nop.m 0 | |
117 | // 3/2 in r3 | |
118 | movl r3=0x3fc00000;; | |
119 | } { .mfi | |
120 | setf.s f10=r3 | |
121 | // Step (1) | |
122 | // y0 = 1/sqrt(a) in f7 | |
0347518d | 123 | fclass.m.unc p7,p8 = f8,0x3A |
d5efd131 MF |
124 | nop.i 0;; |
125 | } { .mlx | |
126 | nop.m 0 | |
127 | // 5/2 in r2 | |
128 | movl r2 = 0x40200000 | |
129 | } { .mlx | |
130 | nop.m 0 | |
131 | // 63/8 in r3 | |
132 | movl r3 = 0x40fc0000;; | |
133 | } { .mfi | |
134 | setf.s f11=r2 | |
135 | // Step (2) | |
136 | // h = +1/2 * y0 in f6 | |
137 | (p6) fma.s1 f6=f9,f7,f0 | |
138 | nop.i 0 | |
139 | } { .mfi | |
140 | setf.s f12=r3 | |
141 | // Step (3) | |
142 | // g = a * y0 in f7 | |
143 | (p6) fma.s1 f7=f8,f7,f0 | |
144 | nop.i 0 | |
145 | } { .mfi | |
146 | nop.m 0 | |
147 | mov f15 = f8 | |
148 | nop.i 0;; | |
149 | } { .mlx | |
150 | nop.m 0 | |
151 | // 231/16 in r2 | |
152 | movl r2 = 0x41670000;; | |
153 | } { .mfi | |
154 | setf.s f13=r2 | |
155 | // Step (4) | |
156 | // e = 1/2 - g * h in f9 | |
157 | (p6) fnma.s1 f9=f7,f6,f9 | |
158 | nop.i 0 | |
159 | } { .mlx | |
160 | nop.m 0 | |
161 | // 35/8 in r3 | |
162 | movl r3 = 0x408c0000;; | |
163 | } { .mfi | |
164 | setf.s f14=r3 | |
165 | // Step (5) | |
166 | // S = 3/2 + 5/2 * e in f10 | |
167 | (p6) fma.s1 f10=f11,f9,f10 | |
168 | nop.i 0 | |
169 | } { .mfi | |
170 | nop.m 0 | |
171 | // Step (6) | |
172 | // e2 = e * e in f11 | |
173 | (p6) fma.s1 f11=f9,f9,f0 | |
174 | nop.i 0;; | |
175 | } { .mfi | |
176 | nop.m 0 | |
177 | // Step (7) | |
178 | // t = 63/8 + 231/16 * e in f12 | |
179 | (p6) fma.s1 f12=f13,f9,f12 | |
180 | nop.i 0;; | |
181 | } { .mfi | |
182 | nop.m 0 | |
183 | // Step (8) | |
184 | // S1 = e + e2 * S in f10 | |
185 | (p6) fma.s1 f10=f11,f10,f9 | |
186 | nop.i 0 | |
187 | } { .mfi | |
188 | nop.m 0 | |
189 | // Step (9) | |
190 | // e4 = e2 * e2 in f11 | |
191 | (p6) fma.s1 f11=f11,f11,f0 | |
192 | nop.i 0;; | |
193 | } { .mfi | |
194 | nop.m 0 | |
195 | // Step (10) | |
196 | // t1 = 35/8 + e * t in f9 | |
197 | (p6) fma.s1 f9=f9,f12,f14 | |
198 | nop.i 0;; | |
199 | } { .mfi | |
200 | nop.m 0 | |
201 | // Step (11) | |
202 | // G = g + S1 * g in f12 | |
203 | (p6) fma.s1 f12=f10,f7,f7 | |
204 | nop.i 0 | |
205 | } { .mfi | |
206 | nop.m 0 | |
207 | // Step (12) | |
208 | // E = g * e4 in f7 | |
209 | (p6) fma.s1 f7=f7,f11,f0 | |
210 | nop.i 0;; | |
211 | } { .mfi | |
212 | nop.m 0 | |
213 | // Step (13) | |
214 | // u = S1 + e4 * t1 in f10 | |
215 | (p6) fma.s1 f10=f11,f9,f10 | |
216 | nop.i 0;; | |
217 | } { .mfi | |
218 | nop.m 0 | |
219 | // Step (14) | |
220 | // g1 = G + t1 * E in f7 | |
221 | (p6) fma.d.s1 f7=f9,f7,f12 | |
222 | nop.i 0;; | |
223 | } { .mfi | |
224 | nop.m 0 | |
225 | // Step (15) | |
226 | // h1 = h + u * h in f6 | |
227 | (p6) fma.s1 f6=f10,f6,f6 | |
228 | nop.i 0;; | |
229 | } { .mfi | |
230 | nop.m 0 | |
231 | // Step (16) | |
232 | // d = a - g1 * g1 in f9 | |
233 | (p6) fnma.s1 f9=f7,f7,f8 | |
234 | nop.i 0;; | |
235 | } { .mfb | |
236 | nop.m 0 | |
237 | // Step (17) | |
238 | // g2 = g1 + d * h1 in f7 | |
239 | (p6) fma.d.s0 f8=f9,f6,f7 | |
240 | (p6) br.ret.sptk b0 ;; | |
0347518d | 241 | } |
d5efd131 MF |
242 | |
243 | { .mfb | |
244 | nop.m 0 | |
245 | mov f8 = f7 | |
246 | (p8) br.ret.sptk b0 ;; | |
247 | } | |
248 | { .mfb | |
249 | (p7) mov r40 = 49 | |
250 | nop.f 0 | |
251 | (p7) br.cond.sptk __libm_error_region ;; | |
252 | } | |
253 | // END DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM | |
254 | GLOBAL_IEEE754_END(sqrt) | |
255 | ||
256 | // Stack operations when calling error support. | |
257 | // (1) (2) (3) (call) (4) | |
258 | // sp -> + psp -> + psp -> + sp -> + | |
259 | // | | | | | |
260 | // | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 | |
261 | // | | | | | |
262 | // | <-GR_Y Y2->| Y2 ->| <- GR_Y | | |
263 | // | | | | | |
264 | // | | <- GR_X X1 ->| | | |
265 | // | | | | | |
266 | // sp-64 -> + sp -> + sp -> + + | |
267 | // save ar.pfs save b0 restore gp | |
268 | // save gp restore ar.pfs | |
269 | ||
270 | ||
271 | LOCAL_LIBM_ENTRY(__libm_error_region) | |
272 | ||
273 | // | |
274 | // This branch includes all those special values that are not negative, | |
275 | // with the result equal to frcpa(x) | |
0347518d | 276 | // |
d5efd131 MF |
277 | |
278 | .prologue | |
279 | // We are distinguishing between over(under)flow and letting | |
280 | // __libm_error_support set ERANGE or do anything else needed. | |
281 | ||
282 | // (1) | |
283 | { .mfi | |
284 | add GR_Parameter_Y=-32,sp // Parameter 2 value | |
285 | nop.f 0 | |
286 | .save ar.pfs,GR_SAVE_PFS | |
287 | mov GR_SAVE_PFS=ar.pfs // Save ar.pfs | |
288 | } | |
289 | { .mfi | |
290 | .fframe 64 | |
291 | add sp=-64,sp // Create new stack | |
292 | nop.f 0 | |
293 | mov GR_SAVE_GP=gp // Save gp | |
294 | };; | |
295 | ||
296 | ||
297 | // (2) | |
298 | { .mmi | |
299 | stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack | |
300 | add GR_Parameter_X = 16,sp // Parameter 1 address | |
301 | .save b0, GR_SAVE_B0 | |
302 | mov GR_SAVE_B0=b0 // Save b0 | |
303 | };; | |
304 | ||
305 | .body | |
306 | // (3) | |
307 | { .mib | |
308 | stfd [GR_Parameter_X] = f15 // STORE Parameter 1 on stack | |
309 | add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address | |
0347518d | 310 | nop.b 0 |
d5efd131 MF |
311 | } |
312 | { .mib | |
313 | stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack | |
314 | add GR_Parameter_Y = -16,GR_Parameter_Y | |
315 | br.call.sptk b0=__libm_error_support# // Call error handling function | |
316 | };; | |
317 | { .mmi | |
318 | nop.m 0 | |
319 | nop.m 0 | |
320 | add GR_Parameter_RESULT = 48,sp | |
321 | };; | |
322 | ||
323 | // (4) | |
324 | { .mmi | |
325 | ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack | |
326 | .restore sp | |
327 | add sp = 64,sp // Restore stack pointer | |
328 | mov b0 = GR_SAVE_B0 // Restore return address | |
329 | };; | |
330 | { .mib | |
331 | mov gp = GR_SAVE_GP // Restore gp | |
332 | mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs | |
333 | br.ret.sptk b0 // Return | |
334 | };; | |
335 | ||
336 | LOCAL_LIBM_END(__libm_error_region) | |
337 | ||
338 | ||
339 | ||
340 | ||
341 | .type __libm_error_support#,@function | |
342 | .global __libm_error_support# |