]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/fpu/s_log1pf.S
(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[thirdparty/glibc.git] / sysdeps / ia64 / fpu / s_log1pf.S
1 .file "log1pf.s"
2
3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
5 //
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
23 //
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 //
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
39 //
40 // History
41 //==============================================================
42 // 2/02/00 Initial version
43 // 4/04/00 Unwind support added
44 // 8/15/00 Bundle added after call to __libm_error_support to properly
45 // set [the previously overwritten] GR_Parameter_RESULT.
46 //
47 // *********************************************************************
48 //
49 // Function: log1pf(x) = ln(x+1), for single precision values
50 //
51 // *********************************************************************
52 //
53 // Accuracy: Very accurate for single precision values
54 //
55 // *********************************************************************
56 //
57 // Resources Used:
58 //
59 // Floating-Point Registers: f8 (Input and Return Value)
60 // f9,f33-f55,f99
61 //
62 // General Purpose Registers:
63 // r32-r53
64 // r54-r57 (Used to pass arguments to error handling routine)
65 //
66 // Predicate Registers: p6-p15
67 //
68 // *********************************************************************
69 //
70 // IEEE Special Conditions:
71 //
72 // Denormal fault raised on denormal inputs
73 // Overflow exceptions cannot occur
74 // Underflow exceptions raised when appropriate for log1pf
75 // (Error Handling Routine called for underflow)
76 // Inexact raised when appropriate by algorithm
77 //
78 // log1pf(inf) = inf
79 // log1pf(-inf) = QNaN
80 // log1pf(+/-0) = +/-0
81 // log1pf(-1) = -inf
82 // log1pf(SNaN) = QNaN
83 // log1pf(QNaN) = QNaN
84 // log1pf(EM_special Values) = QNaN
85 //
86 // *********************************************************************
87 //
88 // Computation is based on the following kernel.
89 //
90 // ker_log_64( in_FR : X,
91 // in_FR : E,
92 // in_FR : Em1,
93 // in_GR : Expo_Range,
94 // out_FR : Y_hi,
95 // out_FR : Y_lo,
96 // out_FR : Scale,
97 // out_PR : Safe )
98 //
99 // Overview
100 //
101 // The method consists of three cases.
102 //
103 // If |X+Em1| < 2^(-80) use case log1pf_small;
104 // elseif |X+Em1| < 2^(-7) use case log_near1;
105 // else use case log_regular;
106 //
107 // Case log1pf_small:
108 //
109 // log( 1 + (X+Em1) ) can be approximated by (X+Em1).
110 //
111 // Case log_near1:
112 //
113 // log( 1 + (X+Em1) ) can be approximated by a simple polynomial
114 // in W = X+Em1. This polynomial resembles the truncated Taylor
115 // series W - W^/2 + W^3/3 - ...
116 //
117 // Case log_regular:
118 //
119 // Here we use a table lookup method. The basic idea is that in
120 // order to compute log(Arg) for an argument Arg in [1,2), we
121 // construct a value G such that G*Arg is close to 1 and that
122 // log(1/G) is obtainable easily from a table of values calculated
123 // beforehand. Thus
124 //
125 // log(Arg) = log(1/G) + log(G*Arg)
126 // = log(1/G) + log(1 + (G*Arg - 1))
127 //
128 // Because |G*Arg - 1| is small, the second term on the right hand
129 // side can be approximated by a short polynomial. We elaborate
130 // this method in four steps.
131 //
132 // Step 0: Initialization
133 //
134 // We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that
135 //
136 // E + X = 2^N * ( S_hi + S_lo ) exactly
137 //
138 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
139 // that |S_lo| <= ulp(S_hi).
140 //
141 // Step 1: Argument Reduction
142 //
143 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
144 //
145 // G := G_1 * G_2 * G_3
146 // r := (G * S_hi - 1) + G * S_lo
147 //
148 // These G_j's have the property that the product is exactly
149 // representable and that |r| < 2^(-12) as a result.
150 //
151 // Step 2: Approximation
152 //
153 //
154 // log(1 + r) is approximated by a short polynomial poly(r).
155 //
156 // Step 3: Reconstruction
157 //
158 //
159 // Finally, log( E + X ) is given by
160 //
161 // log( E + X ) = log( 2^N * (S_hi + S_lo) )
162 // ~=~ N*log(2) + log(1/G) + log(1 + r)
163 // ~=~ N*log(2) + log(1/G) + poly(r).
164 //
165 // **** Algorithm ****
166 //
167 // Case log1pf_small:
168 //
169 // Although log(1 + (X+Em1)) is basically X+Em1, we would like to
170 // preserve the inexactness nature as well as consistent behavior
171 // under different rounding modes. Note that this case can only be
172 // taken if E is set to be 1.0. In this case, Em1 is zero, and that
173 // X can be very tiny and thus the final result can possibly underflow.
174 // Thus, we compare X against a threshold that is dependent on the
175 // input Expo_Range. If |X| is smaller than this threshold, we set
176 // SAFE to be FALSE.
177 //
178 // The result is returned as Y_hi, Y_lo, and in the case of SAFE
179 // is FALSE, an additional value Scale is also returned.
180 //
181 // W := X + Em1
182 // Threshold := Threshold_Table( Expo_Range )
183 // Tiny := Tiny_Table( Expo_Range )
184 //
185 // If ( |W| > Threshold ) then
186 // Y_hi := W
187 // Y_lo := -W*W
188 // Else
189 // Y_hi := W
190 // Y_lo := -Tiny
191 // Scale := 2^(-100)
192 // Safe := FALSE
193 // EndIf
194 //
195 //
196 // One may think that Y_lo should be -W*W/2; however, it does not matter
197 // as Y_lo will be rounded off completely except for the correct effect in
198 // directed rounding. Clearly -W*W is simplier to compute. Moreover,
199 // because of the difference in exponent value, Y_hi + Y_lo or
200 // Y_hi + Scale*Y_lo is always inexact.
201 //
202 // Case log_near1:
203 //
204 // Here we compute a simple polynomial. To exploit parallelism, we split
205 // the polynomial into two portions.
206 //
207 // W := X + Em1
208 // Wsq := W * W
209 // W4 := Wsq*Wsq
210 // W6 := W4*Wsq
211 // Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))
212 // Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
213 // set lsb(Y_lo) to be 1
214 //
215 // Case log_regular:
216 //
217 // We present the algorithm in four steps.
218 //
219 // Step 0. Initialization
220 // ----------------------
221 //
222 // Z := X + E
223 // N := unbaised exponent of Z
224 // S_hi := 2^(-N) * Z
225 // S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) }
226 //
227 // Note that S_lo is always 0 for the case E = 0.
228 //
229 // Step 1. Argument Reduction
230 // --------------------------
231 //
232 // Let
233 //
234 // Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
235 //
236 // We obtain G_1, G_2, G_3 by the following steps.
237 //
238 //
239 // Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
240 // from S_hi.
241 //
242 // Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
243 // to lsb = 2^(-4).
244 //
245 // Define index_1 := [ d_1 d_2 d_3 d_4 ].
246 //
247 // Fetch Z_1 := (1/A_1) rounded UP in fixed point with
248 // fixed point lsb = 2^(-15).
249 // Z_1 looks like z_0.z_1 z_2 ... z_15
250 // Note that the fetching is done using index_1.
251 // A_1 is actually not needed in the implementation
252 // and is used here only to explain how is the value
253 // Z_1 defined.
254 //
255 // Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
256 // floating pt. Again, fetching is done using index_1. A_1
257 // explains how G_1 is defined.
258 //
259 // Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
260 // = 1.0 0 0 0 d_5 ... d_14
261 // This is accomplised by integer multiplication.
262 // It is proved that X_1 indeed always begin
263 // with 1.0000 in fixed point.
264 //
265 //
266 // Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
267 // truncated to lsb = 2^(-8). Similar to A_1,
268 // A_2 is not needed in actual implementation. It
269 // helps explain how some of the values are defined.
270 //
271 // Define index_2 := [ d_5 d_6 d_7 d_8 ].
272 //
273 // Fetch Z_2 := (1/A_2) rounded UP in fixed point with
274 // fixed point lsb = 2^(-15). Fetch done using index_2.
275 // Z_2 looks like z_0.z_1 z_2 ... z_15
276 //
277 // Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
278 // floating pt.
279 //
280 // Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
281 // = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
282 // This is accomplised by integer multiplication.
283 // It is proved that X_2 indeed always begin
284 // with 1.00000000 in fixed point.
285 //
286 //
287 // Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
288 // This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
289 //
290 // Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
291 //
292 // Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
293 // floating pt. Fetch is done using index_3.
294 //
295 // Compute G := G_1 * G_2 * G_3.
296 //
297 // This is done exactly since each of G_j only has 21 sig. bits.
298 //
299 // Compute
300 //
301 // r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
302 //
303 // thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of
304 // rounding errors.
305 //
306 //
307 // Step 2. Approximation
308 // ---------------------
309 //
310 // This step computes an approximation to log( 1 + r ) where r is the
311 // reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
312 // thus log(1+r) can be approximated by a short polynomial:
313 //
314 // log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
315 //
316 //
317 // Step 3. Reconstruction
318 // ----------------------
319 //
320 // This step computes the desired result of log(X+E):
321 //
322 // log(X+E) = log( 2^N * (S_hi + S_lo) )
323 // = N*log(2) + log( S_hi + S_lo )
324 // = N*log(2) + log(1/G) +
325 // log(1 + C*(S_hi+S_lo) - 1 )
326 //
327 // log(2), log(1/G_j) are stored as pairs of (single,double) numbers:
328 // log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
329 // single-precision numbers and the low parts are double precision
330 // numbers. These have the property that
331 //
332 // N*log2_hi + SUM ( log1byGj_hi )
333 //
334 // is computable exactly in double-extended precision (64 sig. bits).
335 // Finally
336 //
337 // Y_hi := N*log2_hi + SUM ( log1byGj_hi )
338 // Y_lo := poly_hi + [ poly_lo +
339 // ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
340 // set lsb(Y_lo) to be 1
341 //
342
343 #include "libm_support.h"
344
345 #ifdef _LIBC
346 .rodata
347 #else
348 .data
349 #endif
350
351 // P_7, P_6, P_5, P_4, P_3, P_2, and P_1
352
353 .align 64
354 Constants_P:
355 ASM_TYPE_DIRECTIVE(Constants_P,@object)
356 data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
357 data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000
358 data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000
359 data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000
360 data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000
361 data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000
362 data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000
363 data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000
364 ASM_SIZE_DIRECTIVE(Constants_P)
365
366 // log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
367
368 .align 64
369 Constants_Q:
370 ASM_TYPE_DIRECTIVE(Constants_Q,@object)
371 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
372 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
373 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
374 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
375 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
376 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
377 ASM_SIZE_DIRECTIVE(Constants_Q)
378
379 // Z1 - 16 bit fixed, G1 and H1 - IEEE single
380
381 .align 64
382 Constants_Z_G_H_h1:
383 ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object)
384 data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
385 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6
386 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6
387 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF
388 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C
389 data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C
390 data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F
391 data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B
392 data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34
393 data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E
394 data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C
395 data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3
396 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2
397 data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895
398 data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5
399 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874
400 ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1)
401
402 // Z2 - 16 bit fixed, G2 and H2 - IEEE single
403
404 .align 64
405 Constants_Z_G_H_h2:
406 ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object)
407 data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
408 data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116
409 data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF
410 data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E
411 data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0
412 data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F
413 data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791
414 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C
415 data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156
416 data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97
417 data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483
418 data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9
419 data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06
420 data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202
421 data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4
422 data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391
423 ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2)
424
425 // G3 and H3 - IEEE single and h3 -IEEE double
426
427 .align 64
428 Constants_Z_G_H_h3:
429 ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object)
430 data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
431 data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
432 data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
433 data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
434 data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
435 data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
436 data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
437 data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
438 data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
439 data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
440 data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
441 data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
442 data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
443 data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
444 data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
445 data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
446 data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
447 data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
448 data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
449 data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
450 data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
451 data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
452 data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
453 data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
454 data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
455 data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
456 data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
457 data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
458 data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
459 data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
460 data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
461 data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
462 ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3)
463
464 //
465 // Exponent Thresholds and Tiny Thresholds
466 // for 8, 11, 15, and 17 bit exponents
467 //
468 // Expo_Range Value
469 //
470 // 0 (8 bits) 2^(-126)
471 // 1 (11 bits) 2^(-1022)
472 // 2 (15 bits) 2^(-16382)
473 // 3 (17 bits) 2^(-16382)
474 //
475 // Tiny_Table
476 // ----------
477 // Expo_Range Value
478 //
479 // 0 (8 bits) 2^(-16382)
480 // 1 (11 bits) 2^(-16382)
481 // 2 (15 bits) 2^(-16382)
482 // 3 (17 bits) 2^(-16382)
483 //
484
485 .align 64
486 Constants_Threshold:
487 ASM_TYPE_DIRECTIVE(Constants_Threshold,@object)
488 data4 0x00000000,0x80000000,0x00003F81,0x00000000
489 data4 0x00000000,0x80000000,0x00000001,0x00000000
490 data4 0x00000000,0x80000000,0x00003C01,0x00000000
491 data4 0x00000000,0x80000000,0x00000001,0x00000000
492 data4 0x00000000,0x80000000,0x00000001,0x00000000
493 data4 0x00000000,0x80000000,0x00000001,0x00000000
494 data4 0x00000000,0x80000000,0x00000001,0x00000000
495 data4 0x00000000,0x80000000,0x00000001,0x00000000
496 ASM_SIZE_DIRECTIVE(Constants_Threshold)
497
498 .align 64
499 Constants_1_by_LN10:
500 ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object)
501 data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000
502 data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000
503 ASM_SIZE_DIRECTIVE(Constants_1_by_LN10)
504
505 FR_Input_X = f8
506 FR_Neg_One = f9
507 FR_E = f33
508 FR_Em1 = f34
509 FR_Y_hi = f34
510 // Shared with Em1
511 FR_Y_lo = f35
512 FR_Scale = f36
513 FR_X_Prime = f37
514 FR_Z = f38
515 FR_S_hi = f38
516 // Shared with Z
517 FR_W = f39
518 FR_G = f40
519 FR_wsq = f40
520 // Shared with G
521 FR_H = f41
522 FR_w4 = f41
523 // Shared with H
524 FR_h = f42
525 FR_w6 = f42
526 // Shared with h
527 FR_G_tmp = f43
528 FR_poly_lo = f43
529 // Shared with G_tmp
530 FR_P8 = f43
531 // Shared with G_tmp
532 FR_H_tmp = f44
533 FR_poly_hi = f44
534 // Shared with H_tmp
535 FR_P7 = f44
536 // Shared with H_tmp
537 FR_h_tmp = f45
538 FR_rsq = f45
539 // Shared with h_tmp
540 FR_P6 = f45
541 // Shared with h_tmp
542 FR_abs_W = f46
543 FR_r = f46
544 // Shared with abs_W
545 FR_AA = f47
546 FR_log2_hi = f47
547 // Shared with AA
548 FR_BB = f48
549 FR_log2_lo = f48
550 // Shared with BB
551 FR_S_lo = f49
552 FR_two_negN = f50
553 FR_float_N = f51
554 FR_Q4 = f52
555 FR_dummy = f52
556 // Shared with Q4
557 FR_P4 = f52
558 // Shared with Q4
559 FR_Threshold = f52
560 // Shared with Q4
561 FR_Q3 = f53
562 FR_P3 = f53
563 // Shared with Q3
564 FR_Tiny = f53
565 // Shared with Q3
566 FR_Q2 = f54
567 FR_P2 = f54
568 // Shared with Q2
569 FR_1LN10_hi = f54
570 // Shared with Q2
571 FR_Q1 = f55
572 FR_P1 = f55
573 // Shared with Q1
574 FR_1LN10_lo = f55
575 // Shared with Q1
576 FR_P5 = f98
577 FR_SCALE = f98
578 FR_Output_X_tmp = f99
579
580 GR_Expo_Range = r32
581 GR_Table_Base = r34
582 GR_Table_Base1 = r35
583 GR_Table_ptr = r36
584 GR_Index2 = r37
585 GR_signif = r38
586 GR_X_0 = r39
587 GR_X_1 = r40
588 GR_X_2 = r41
589 GR_Z_1 = r42
590 GR_Z_2 = r43
591 GR_N = r44
592 GR_Bias = r45
593 GR_M = r46
594 GR_ScaleN = r47
595 GR_Index3 = r48
596 GR_Perturb = r49
597 GR_Table_Scale = r50
598
599
600 GR_SAVE_PFS = r51
601 GR_SAVE_B0 = r52
602 GR_SAVE_GP = r53
603
604 GR_Parameter_X = r54
605 GR_Parameter_Y = r55
606 GR_Parameter_RESULT = r56
607
608 GR_Parameter_TAG = r57
609
610
611 .section .text
612 .proc log1pf#
613 .global log1pf#
614 .align 64
615 log1pf:
616 #ifdef _LIBC
617 .global __log1pf
618 __log1pf:
619 #endif
620
621 { .mfi
622 alloc r32 = ar.pfs,0,22,4,0
623 (p0) fsub.s1 FR_Neg_One = f0,f1
624 (p0) cmp.eq.unc p7, p0 = r0, r0
625 }
626
627 { .mfi
628 (p0) cmp.ne.unc p14, p0 = r0, r0
629 (p0) fnorm.s1 FR_X_Prime = FR_Input_X
630 (p0) cmp.eq.unc p15, p0 = r0, r0 ;;
631 }
632
633 { .mfi
634 nop.m 999
635 (p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
636 nop.i 999
637 }
638 ;;
639
640 { .mfi
641 nop.m 999
642 (p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
643 nop.i 999
644 }
645 ;;
646
647 { .mfi
648 nop.m 999
649 (p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0
650 nop.i 999
651 }
652
653 { .mfi
654 nop.m 999
655 (p0) fadd FR_Em1 = f0,f0
656 nop.i 999 ;;
657 }
658
659 { .mfi
660 nop.m 999
661 (p0) fadd FR_E = f0,f1
662 nop.i 999 ;;
663 }
664
665 { .mfi
666 nop.m 999
667 (p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One
668 nop.i 999
669 }
670
671 { .mfi
672 nop.m 999
673 (p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One
674 nop.i 999
675 }
676
677
678 L(LOG_BEGIN):
679
680 { .mfi
681 nop.m 999
682 (p0) fadd.s1 FR_Z = FR_X_Prime, FR_E
683 nop.i 999
684 }
685
686 { .mlx
687 nop.m 999
688 (p0) movl GR_Table_Scale = 0x0000000000000018 ;;
689 }
690
691 { .mmi
692 nop.m 999
693 //
694 // Create E = 1 and Em1 = 0
695 // Check for X == 0, meaning log(1+0)
696 // Check for X < -1, meaning log(negative)
697 // Check for X == -1, meaning log(0)
698 // Normalize x
699 // Identify NatVals, NaNs, Infs.
700 // Identify EM unsupporteds.
701 // Identify Negative values - us S1 so as
702 // not to raise denormal operand exception
703 // Set p15 to true for log1pf
704 // Set p14 to false for log1pf
705 // Set p7 true for log and log1pf
706 //
707 (p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp
708 nop.i 999
709 }
710
711 { .mfi
712 nop.m 999
713 (p0) fmax.s1 FR_AA = FR_X_Prime, FR_E
714 nop.i 999 ;;
715 }
716
717 { .mfi
718 ld8 GR_Table_Base = [GR_Table_Base]
719 (p0) fmin.s1 FR_BB = FR_X_Prime, FR_E
720 nop.i 999
721 }
722
723 { .mfb
724 nop.m 999
725 (p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1
726 //
727 // Begin load of constants base
728 // FR_Z = Z = |x| + E
729 // FR_W = W = |x| + Em1
730 // AA = fmax(|x|,E)
731 // BB = fmin(|x|,E)
732 //
733 (p6) br.cond.spnt L(LOG_64_special) ;;
734 }
735
736 { .mib
737 nop.m 999
738 nop.i 999
739 (p10) br.cond.spnt L(LOG_64_unsupported) ;;
740 }
741
742 { .mib
743 nop.m 999
744 nop.i 999
745 (p13) br.cond.spnt L(LOG_64_negative) ;;
746 }
747
748 { .mib
749 (p0) getf.sig GR_signif = FR_Z
750 nop.i 999
751 (p9) br.cond.spnt L(LOG_64_one) ;;
752 }
753
754 { .mib
755 nop.m 999
756 nop.i 999
757 (p8) br.cond.spnt L(LOG_64_zero) ;;
758 }
759
760 { .mfi
761 (p0) getf.exp GR_N = FR_Z
762 //
763 // Raise possible denormal operand exception
764 // Create Bias
765 //
766 // This function computes ln( x + e )
767 // Input FR 1: FR_X = FR_Input_X
768 // Input FR 2: FR_E = FR_E
769 // Input FR 3: FR_Em1 = FR_Em1
770 // Input GR 1: GR_Expo_Range = GR_Expo_Range = 1
771 // Output FR 4: FR_Y_hi
772 // Output FR 5: FR_Y_lo
773 // Output FR 6: FR_Scale
774 // Output PR 7: PR_Safe
775 //
776 (p0) fsub.s1 FR_S_lo = FR_AA, FR_Z
777 //
778 // signif = getf.sig(Z)
779 // abs_W = fabs(w)
780 //
781 (p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;;
782 }
783
784 { .mfi
785 nop.m 999
786 (p0) fmerge.se FR_S_hi = f1,FR_Z
787 (p0) extr.u GR_X_0 = GR_signif, 49, 15
788 }
789
790 { .mmi
791 nop.m 999
792 (p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp
793 nop.i 999
794 }
795 ;;
796
797 { .mlx
798 ld8 GR_Table_Base1 = [GR_Table_Base1]
799 (p0) movl GR_Bias = 0x000000000000FFFF ;;
800 }
801
802 { .mfi
803 nop.m 999
804 (p0) fabs FR_abs_W = FR_W
805 (p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0
806 }
807
808 { .mfi
809 nop.m 999
810 //
811 // Branch out for special input values
812 //
813 (p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0
814 nop.i 999 ;;
815 }
816
817 { .mfi
818 nop.m 999
819 //
820 // X_0 = extr.u(signif,49,15)
821 // Index1 = extr.u(signif,59,4)
822 //
823 (p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB
824 nop.i 999 ;;
825 }
826
827 { .mii
828 nop.m 999
829 nop.i 999 ;;
830 //
831 // Offset_to_Z1 = 24 * Index1
832 // For performance, don't use result
833 // for 3 or 4 cycles.
834 //
835 (p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;;
836 }
837 //
838 // Add Base to Offset for Z1
839 // Create Bias
840
841 { .mmi
842 (p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;;
843 (p0) ldfs FR_G = [GR_Table_ptr],4
844 nop.i 999 ;;
845 }
846
847 { .mmi
848 (p0) ldfs FR_H = [GR_Table_ptr],8 ;;
849 (p0) ldfd FR_h = [GR_Table_ptr],0
850 (p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
851 }
852 //
853 // Load Z_1
854 // Get Base of Table2
855 //
856
857 { .mfi
858 (p0) getf.exp GR_M = FR_abs_W
859 nop.f 999
860 nop.i 999 ;;
861 }
862
863 { .mii
864 nop.m 999
865 nop.i 999 ;;
866 //
867 // M = getf.exp(abs_W)
868 // S_lo = AA - Z
869 // X_1 = pmpyshr2(X_0,Z_1,15)
870 //
871 (p0) sub GR_M = GR_M, GR_Bias ;;
872 }
873 //
874 // M = M - Bias
875 // Load G1
876 // N = getf.exp(Z)
877 //
878
879 { .mii
880 (p0) cmp.gt.unc p11, p0 = -80, GR_M
881 (p0) cmp.gt.unc p12, p0 = -7, GR_M ;;
882 (p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;;
883 }
884
885 { .mib
886 nop.m 999
887 //
888 // if -80 > M, set p11
889 // Index2 = extr.u(X_1,6,4)
890 // if -7 > M, set p12
891 // Load H1
892 //
893 (p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0
894 (p11) br.cond.spnt L(log1pf_small) ;;
895 }
896
897 { .mib
898 nop.m 999
899 nop.i 999
900 (p12) br.cond.spnt L(log1pf_near) ;;
901 }
902
903 { .mii
904 (p0) sub GR_N = GR_N, GR_Bias
905 //
906 // poly_lo = r * poly_lo
907 //
908 (p0) add GR_Perturb = 0x1, r0 ;;
909 (p0) sub GR_ScaleN = GR_Bias, GR_N
910 }
911
912 { .mii
913 (p0) setf.sig FR_float_N = GR_N
914 nop.i 999 ;;
915 //
916 // Prepare Index2 - pmpyshr2.u(X_1,Z_2,15)
917 // Load h1
918 // S_lo = S_lo + BB
919 // Branch for -80 > M
920 //
921 (p0) add GR_Index2 = GR_Index2, GR_Table_Base1
922 }
923
924 { .mmi
925 (p0) setf.exp FR_two_negN = GR_ScaleN
926 nop.m 999
927 (p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp
928 };;
929
930 //
931 // Index2 points to Z2
932 // Branch for -7 > M
933 //
934
935 { .mmb
936 (p0) ld4 GR_Z_2 = [GR_Index2],4
937 ld8 GR_Table_Base = [GR_Table_Base]
938 nop.b 999 ;;
939 }
940 (p0) nop.i 999
941 //
942 // Load Z_2
943 // N = N - Bias
944 // Tablebase points to Table3
945 //
946
947 { .mmi
948 (p0) ldfs FR_G_tmp = [GR_Index2],4 ;;
949 //
950 // Load G_2
951 // pmpyshr2 X_2= (X_1,Z_2,15)
952 // float_N = setf.sig(N)
953 // ScaleN = Bias - N
954 //
955 (p0) ldfs FR_H_tmp = [GR_Index2],8
956 nop.i 999 ;;
957 }
958 //
959 // Load H_2
960 // two_negN = setf.exp(scaleN)
961 // G = G_1 * G_2
962 //
963
964 { .mfi
965 (p0) ldfd FR_h_tmp = [GR_Index2],0
966 nop.f 999
967 (p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;;
968 }
969
970 { .mii
971 nop.m 999
972 (p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;;
973 //
974 // Load h_2
975 // H = H_1 + H_2
976 // h = h_1 + h_2
977 // Index3 = extr.u(X_2,1,5)
978 //
979 (p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base
980 }
981
982 { .mmi
983 nop.m 999
984 nop.m 999
985 //
986 // float_N = fcvt.xf(float_N)
987 // load G3
988 //
989 (p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;;
990 }
991
992 { .mfi
993 ld8 GR_Table_Base = [GR_Table_Base]
994 nop.f 999
995 nop.i 999
996 } ;;
997
998 { .mfi
999 (p0) ldfe FR_log2_hi = [GR_Table_Base],16
1000 (p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN
1001 nop.i 999 ;;
1002 }
1003
1004 { .mmf
1005 nop.m 999
1006 //
1007 // G = G3 * G
1008 // Load h3
1009 // Load log2_hi
1010 // H = H + H3
1011 //
1012 (p0) ldfe FR_log2_lo = [GR_Table_Base],16
1013 (p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;;
1014 }
1015
1016 { .mmf
1017 (p0) ldfs FR_G_tmp = [GR_Index3],4
1018 //
1019 // h = h + h3
1020 // r = G * S_hi + 1
1021 // Load log2_lo
1022 //
1023 (p0) ldfe FR_Q4 = [GR_Table_Base],16
1024 (p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;;
1025 }
1026
1027 { .mfi
1028 (p0) ldfe FR_Q3 = [GR_Table_Base],16
1029 (p0) fadd.s1 FR_H = FR_H, FR_H_tmp
1030 nop.i 999 ;;
1031 }
1032
1033 { .mmf
1034 (p0) ldfs FR_H_tmp = [GR_Index3],4
1035 (p0) ldfe FR_Q2 = [GR_Table_Base],16
1036 //
1037 // Comput Index for Table3
1038 // S_lo = S_lo * two_negN
1039 //
1040 (p0) fcvt.xf FR_float_N = FR_float_N ;;
1041 }
1042 //
1043 // If S_lo == 0, set p8 false
1044 // Load H3
1045 // Load ptr to table of polynomial coeff.
1046 //
1047
1048 { .mmf
1049 (p0) ldfd FR_h_tmp = [GR_Index3],0
1050 (p0) ldfe FR_Q1 = [GR_Table_Base],0
1051 (p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;;
1052 }
1053
1054 { .mfi
1055 nop.m 999
1056 (p0) fmpy.s1 FR_G = FR_G, FR_G_tmp
1057 nop.i 999 ;;
1058 }
1059
1060 { .mfi
1061 nop.m 999
1062 (p0) fadd.s1 FR_H = FR_H, FR_H_tmp
1063 nop.i 999 ;;
1064 }
1065
1066 { .mfi
1067 nop.m 999
1068 (p0) fms.s1 FR_r = FR_G, FR_S_hi, f1
1069 nop.i 999
1070 }
1071
1072 { .mfi
1073 nop.m 999
1074 (p0) fadd.s1 FR_h = FR_h, FR_h_tmp
1075 nop.i 999 ;;
1076 }
1077
1078 { .mfi
1079 nop.m 999
1080 (p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H
1081 nop.i 999 ;;
1082 }
1083
1084 { .mfi
1085 nop.m 999
1086 //
1087 // Load Q4
1088 // Load Q3
1089 // Load Q2
1090 // Load Q1
1091 //
1092 (p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r
1093 nop.i 999
1094 }
1095
1096 { .mfi
1097 nop.m 999
1098 //
1099 // poly_lo = r * Q4 + Q3
1100 // rsq = r* r
1101 //
1102 (p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h
1103 nop.i 999 ;;
1104 }
1105
1106 { .mfi
1107 nop.m 999
1108 //
1109 // If (S_lo!=0) r = s_lo * G + r
1110 //
1111 (p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
1112 nop.i 999
1113 }
1114 //
1115 // Create a 0x00000....01
1116 // poly_lo = poly_lo * rsq + h
1117 //
1118
1119 { .mfi
1120 (p0) setf.sig FR_dummy = GR_Perturb
1121 (p0) fmpy.s1 FR_rsq = FR_r, FR_r
1122 nop.i 999 ;;
1123 }
1124
1125 { .mfi
1126 nop.m 999
1127 //
1128 // h = N * log2_lo + h
1129 // Y_hi = n * log2_hi + H
1130 //
1131 (p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
1132 nop.i 999
1133 }
1134
1135 { .mfi
1136 nop.m 999
1137 (p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
1138 nop.i 999 ;;
1139 }
1140
1141 { .mfi
1142 nop.m 999
1143 //
1144 // poly_lo = r * poly_o + Q2
1145 // poly_hi = Q1 * rsq + r
1146 //
1147 (p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r
1148 nop.i 999 ;;
1149 }
1150
1151 { .mfi
1152 nop.m 999
1153 (p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h
1154 nop.i 999 ;;
1155 }
1156
1157 { .mfb
1158 nop.m 999
1159 (p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo
1160 //
1161 // Create the FR for a binary "or"
1162 // Y_lo = poly_hi + poly_lo
1163 //
1164 // (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
1165 //
1166 // Turn the lsb of Y_lo ON
1167 //
1168 // (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
1169 //
1170 // Merge the new lsb into Y_lo, for alone doesn't
1171 //
1172 (p0) br.cond.sptk L(LOG_main) ;;
1173 }
1174
1175
1176 L(log1pf_near):
1177
1178 { .mmi
1179 nop.m 999
1180 nop.m 999
1181 // /*******************************************************/
1182 // /*********** Branch log1pf_near ************************/
1183 // /*******************************************************/
1184 (p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;;
1185 }
1186 //
1187 // Load base address of poly. coeff.
1188 //
1189 {.mmi
1190 nop.m 999
1191 ld8 GR_Table_Base = [GR_Table_Base]
1192 nop.i 999
1193 };;
1194
1195 { .mmb
1196 (p0) add GR_Table_ptr = 0x40,GR_Table_Base
1197 //
1198 // Address tables with separate pointers
1199 //
1200 (p0) ldfe FR_P8 = [GR_Table_Base],16
1201 nop.b 999 ;;
1202 }
1203
1204 { .mmb
1205 (p0) ldfe FR_P4 = [GR_Table_ptr],16
1206 //
1207 // Load P4
1208 // Load P8
1209 //
1210 (p0) ldfe FR_P7 = [GR_Table_Base],16
1211 nop.b 999 ;;
1212 }
1213
1214 { .mmf
1215 (p0) ldfe FR_P3 = [GR_Table_ptr],16
1216 //
1217 // Load P3
1218 // Load P7
1219 //
1220 (p0) ldfe FR_P6 = [GR_Table_Base],16
1221 (p0) fmpy.s1 FR_wsq = FR_W, FR_W ;;
1222 }
1223
1224 { .mfi
1225 (p0) ldfe FR_P2 = [GR_Table_ptr],16
1226 nop.f 999
1227 nop.i 999 ;;
1228 }
1229
1230 { .mfi
1231 nop.m 999
1232 (p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3
1233 nop.i 999
1234 }
1235 //
1236 // Load P2
1237 // Load P6
1238 // Wsq = w * w
1239 // Y_hi = p4 * w + p3
1240 //
1241
1242 { .mfi
1243 (p0) ldfe FR_P5 = [GR_Table_Base],16
1244 (p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7
1245 nop.i 999 ;;
1246 }
1247
1248 { .mfi
1249 (p0) ldfe FR_P1 = [GR_Table_ptr],16
1250 //
1251 // Load P1
1252 // Load P5
1253 // Y_lo = p8 * w + P7
1254 //
1255 (p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq
1256 nop.i 999 ;;
1257 }
1258
1259 { .mfi
1260 nop.m 999
1261 (p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2
1262 nop.i 999
1263 }
1264
1265 { .mfi
1266 nop.m 999
1267 (p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6
1268 (p0) add GR_Perturb = 0x1, r0 ;;
1269 }
1270
1271 { .mfi
1272 nop.m 999
1273 //
1274 // w4 = w2 * w2
1275 // Y_hi = y_hi * w + p2
1276 // Y_lo = y_lo * w + p6
1277 // Create perturbation bit
1278 //
1279 (p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq
1280 nop.i 999 ;;
1281 }
1282
1283 { .mfi
1284 nop.m 999
1285 (p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1
1286 nop.i 999
1287 }
1288 //
1289 // Y_hi = y_hi * w + p1
1290 // w6 = w4 * w2
1291 //
1292
1293 { .mfi
1294 (p0) setf.sig FR_Q4 = GR_Perturb
1295 (p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5
1296 nop.i 999 ;;
1297 }
1298
1299 { .mfi
1300 nop.m 999
1301 (p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W
1302 nop.i 999
1303 }
1304
1305 { .mfb
1306 nop.m 999
1307 //
1308 // Y_hi = y_hi * wsq + w
1309 // Y_lo = y_lo * w + p5
1310 //
1311 (p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo
1312 //
1313 // Y_lo = y_lo * w6
1314 //
1315 // (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
1316 //
1317 // Set lsb on: Taken out to improve performance
1318 //
1319 // (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
1320 //
1321 // Make sure it's on in Y_lo also. Taken out to improve
1322 // performance
1323 //
1324 (p0) br.cond.sptk L(LOG_main) ;;
1325 }
1326
1327
1328 L(log1pf_small):
1329
1330 { .mmi
1331 nop.m 999
1332 nop.m 999
1333 // /*******************************************************/
1334 // /*********** Branch log1pf_small ***********************/
1335 // /*******************************************************/
1336 (p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp
1337 }
1338
1339 { .mfi
1340 nop.m 999
1341 (p0) mov FR_Em1 = FR_W
1342 (p0) cmp.eq.unc p7, p0 = r0, r0 ;;
1343 }
1344
1345 { .mlx
1346 ld8 GR_Table_Base = [GR_Table_Base]
1347 (p0) movl GR_Expo_Range = 0x0000000000000002 ;;
1348 }
1349 //
1350 // Set Safe to true
1351 // Set Expo_Range = 0 for single
1352 // Set Expo_Range = 2 for double
1353 // Set Expo_Range = 4 for double-extended
1354 //
1355
1356 { .mmi
1357 (p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;;
1358 (p0) ldfe FR_Threshold = [GR_Table_Base],16
1359 nop.i 999
1360 }
1361
1362 { .mlx
1363 nop.m 999
1364 (p0) movl GR_Bias = 0x000000000000FF9B ;;
1365 }
1366
1367 { .mfi
1368 (p0) ldfe FR_Tiny = [GR_Table_Base],0
1369 nop.f 999
1370 nop.i 999 ;;
1371 }
1372
1373 { .mfi
1374 nop.m 999
1375 (p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold
1376 nop.i 999 ;;
1377 }
1378
1379 { .mfi
1380 nop.m 999
1381 (p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W
1382 nop.i 999
1383 }
1384
1385 { .mfi
1386 nop.m 999
1387 (p13) fadd FR_SCALE = f0, f1
1388 nop.i 999 ;;
1389 }
1390
1391 { .mfi
1392 nop.m 999
1393 (p12) fsub.s1 FR_Y_lo = f0, FR_Tiny
1394 (p12) cmp.ne.unc p7, p0 = r0, r0
1395 }
1396
1397 { .mfi
1398 (p12) setf.exp FR_SCALE = GR_Bias
1399 nop.f 999
1400 nop.i 999 ;;
1401 }
1402
1403 //
1404 // Set p7 to SAFE = FALSE
1405 // Set Scale = 2^-100
1406 //
1407 { .mfb
1408 nop.m 999
1409 (p0) fma.s.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi
1410 (p0) br.ret.sptk b0
1411 }
1412 ;;
1413
1414 L(LOG_64_one):
1415
1416 { .mfb
1417 nop.m 999
1418 (p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0
1419 (p0) br.ret.sptk b0
1420 }
1421 ;;
1422 //
1423 // Raise divide by zero for +/-0 input.
1424 //
1425
1426 L(LOG_64_zero):
1427
1428 { .mfi
1429 (p0) mov GR_Parameter_TAG = 142
1430 //
1431 // If we have log1pf(0), return -Inf.
1432 //
1433 (p0) fsub.s0 FR_Output_X_tmp = f0, f1
1434 nop.i 999 ;;
1435 }
1436 { .mfb
1437 nop.m 999
1438 (p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
1439 (p0) br.cond.sptk L(LOG_ERROR_Support) ;;
1440 }
1441
1442 L(LOG_64_special):
1443
1444 { .mfi
1445 nop.m 999
1446 //
1447 // Return -Inf or value from handler.
1448 //
1449 (p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1
1450 nop.i 999 ;;
1451 }
1452
1453 { .mfb
1454 nop.m 999
1455 //
1456 // Check for Natval, QNan, SNaN, +Inf
1457 //
1458 (p7) fmpy.s.s0 f8 = FR_Input_X, f1
1459 //
1460 // For SNaN raise invalid and return QNaN.
1461 // For QNaN raise invalid and return QNaN.
1462 // For +Inf return +Inf.
1463 //
1464 (p7) br.ret.sptk b0
1465 }
1466 ;;
1467
1468 //
1469 // For -Inf raise invalid and return QNaN.
1470 //
1471
1472 { .mfb
1473 (p0) mov GR_Parameter_TAG = 143
1474 (p0) fmpy.s.s0 FR_Output_X_tmp = FR_Input_X, f0
1475 (p0) br.cond.sptk L(LOG_ERROR_Support) ;;
1476 }
1477
1478 //
1479 // Report that log1pf(-Inf) computed
1480 //
1481
1482 L(LOG_64_unsupported):
1483
1484 //
1485 // Return generated NaN or other value .
1486 //
1487
1488 { .mfb
1489 nop.m 999
1490 (p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0
1491 (p0) br.ret.sptk b0 ;;
1492 }
1493
1494 L(LOG_64_negative):
1495
1496 { .mfi
1497 nop.m 999
1498 //
1499 // Deal with x < 0 in a special way
1500 //
1501 (p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
1502 //
1503 // Deal with x < 0 in a special way - raise
1504 // invalid and produce QNaN indefinite.
1505 //
1506 (p0) mov GR_Parameter_TAG = 143;;
1507 }
1508
1509 .endp log1pf#
1510 ASM_SIZE_DIRECTIVE(log1pf)
1511
1512 .proc __libm_error_region
1513 __libm_error_region:
1514 L(LOG_ERROR_Support):
1515 .prologue
1516
1517 // (1)
1518 { .mfi
1519 add GR_Parameter_Y=-32,sp // Parameter 2 value
1520 nop.f 0
1521 .save ar.pfs,GR_SAVE_PFS
1522 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1523 }
1524 { .mfi
1525 .fframe 64
1526 add sp=-64,sp // Create new stack
1527 nop.f 0
1528 mov GR_SAVE_GP=gp // Save gp
1529 };;
1530
1531
1532 // (2)
1533 { .mmi
1534 stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
1535 add GR_Parameter_X = 16,sp // Parameter 1 address
1536 .save b0, GR_SAVE_B0
1537 mov GR_SAVE_B0=b0 // Save b0
1538 };;
1539
1540 .body
1541 // (3)
1542 { .mib
1543 stfs [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack
1544 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1545 nop.b 0
1546 }
1547 { .mib
1548 stfs [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack
1549 add GR_Parameter_Y = -16,GR_Parameter_Y
1550 br.call.sptk b0=__libm_error_support# // Call error handling function
1551 };;
1552 { .mmi
1553 nop.m 0
1554 nop.m 0
1555 add GR_Parameter_RESULT = 48,sp
1556 };;
1557
1558 // (4)
1559 { .mmi
1560 ldfs FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack
1561 .restore sp
1562 add sp = 64,sp // Restore stack pointer
1563 mov b0 = GR_SAVE_B0 // Restore return address
1564 };;
1565 { .mib
1566 mov gp = GR_SAVE_GP // Restore gp
1567 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1568 br.ret.sptk b0
1569 };;
1570
1571 .endp __libm_error_region
1572 ASM_SIZE_DIRECTIVE(__libm_error_region)
1573
1574
1575 .proc __libm_LOG_main
1576 __libm_LOG_main:
1577 L(LOG_main):
1578
1579 //
1580 // kernel_log_64 computes ln(X + E)
1581 //
1582
1583 { .mfi
1584 nop.m 999
1585 (p7) fadd.s.s0 FR_Input_X = FR_Y_lo,FR_Y_hi
1586 nop.i 999
1587 }
1588
1589 { .mmi
1590 nop.m 999
1591 nop.m 999
1592 (p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;;
1593 }
1594
1595 { .mmi
1596 nop.m 999
1597 (p14) ld8 GR_Table_Base = [GR_Table_Base]
1598 nop.i 999
1599 };;
1600
1601 { .mmi
1602 (p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;;
1603 (p14) ldfe FR_1LN10_lo = [GR_Table_Base]
1604 nop.i 999 ;;
1605 }
1606
1607 { .mfi
1608 nop.m 999
1609 (p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi
1610 nop.i 999 ;;
1611 }
1612
1613 { .mfi
1614 nop.m 999
1615 (p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp
1616 nop.i 999 ;;
1617 }
1618
1619 { .mfb
1620 nop.m 999
1621 (p14) fma.s.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp
1622 (p0) br.ret.sptk b0 ;;
1623 }
1624 .endp __libm_LOG_main
1625 ASM_SIZE_DIRECTIVE(__libm_LOG_main)
1626
1627
1628 .type __libm_error_support#,@function
1629 .global __libm_error_support#