]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
54b40531 | 2 | # Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
367ace68 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
bcb43bb3 AP |
9 | |
10 | # ==================================================================== | |
e3713c36 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
2e21922e AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
bcb43bb3 AP |
15 | # ==================================================================== |
16 | ||
aa2be094 AP |
17 | # October 2005 |
18 | # | |
bcb43bb3 AP |
19 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? |
20 | # Because unlike integer multiplier, which simply stalls whole CPU, | |
21 | # FPU is fully pipelined and can effectively emit 48 bit partial | |
22 | # product every cycle. Why not blended SPARC v9? One can argue that | |
23 | # making this module dependent on UltraSPARC VIS extension limits its | |
a4d729f3 AP |
24 | # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) |
25 | # implementations from compatibility matrix. But the rest, whole Sun | |
26 | # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | |
27 | # VIS extension instructions used in this module. This is considered | |
73b979e6 AP |
28 | # good enough to not care about HAL SPARC64 users [if any] who have |
29 | # integer-only pure SPARCv9 module to "fall down" to. | |
bcb43bb3 AP |
30 | |
31 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | |
32 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | |
33 | # performance improves by a few percent for shorter keys and worsens by | |
aa2be094 | 34 | # a few percent for longer keys. This is because USIII integer multiplier |
bcb43bb3 AP |
35 | # is >3x faster than USI&II one, which is harder to match [but see |
36 | # TODO list below]. It should also be noted that SPARC64 V features | |
37 | # out-of-order execution, which *might* mean that integer multiplier | |
a4d729f3 AP |
38 | # is pipelined, which in turn *might* be impossible to match... As an |
39 | # additional note, SPARC64 V implements an FP Multiply-Add instruction, | |
40 | # which is perfectly usable in this context... In other words, as far | |
73b979e6 | 41 | # as Fujitsu SPARC64 V goes, talk to the author:-) |
aa2be094 | 42 | |
a00e414f AP |
43 | # The implementation imposes the following "non-natural" limitations on |
44 | # input arguments: | |
aa2be094 AP |
45 | # - num may not be less than 4; |
46 | # - num has to be even; | |
aa2be094 AP |
47 | # Failure to meet either condition has no fatal effects, it simply |
48 | # doesn't give any performance gain. | |
49 | ||
bcb43bb3 | 50 | # TODO: |
bcb43bb3 AP |
51 | # - modulo-schedule inner loop for better performance (on in-order |
52 | # execution core such as UltraSPARC this shall result in further | |
53 | # noticeable(!) improvement); | |
54 | # - dedicated squaring procedure[?]; | |
55 | ||
2e21922e AP |
56 | ###################################################################### |
57 | # November 2006 | |
58 | # | |
59 | # Modulo-scheduled inner loops make it possible to interleave floating point and | |
60 | # integer instructions and minimize Read-After-Write penalties. This | |
60250017 | 61 | # results in *further* 20-50% performance improvement [depending on |
2e21922e AP |
62 | # key length, more for longer keys] on USI&II cores and 30-80% - on |
63 | # USIII&IV. | |
64 | ||
1aa89a7a RL |
65 | # $output is the last argument if it looks like a file (it has an extension)
66 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
67 | # Redirect STDOUT to $output when one was given; stop right here, naming the file, if it cannot be opened. | |
68 | if ($output) { open(STDOUT, ">", $output) or die "can't open $output: $!"; } | |
6bd7a4d9 | 69 | |
a00e414f | 70 | $fname="bn_mul_mont_fpu"; # name of the generated assembly routine |
eb77e888 AP |
71 | |
72 | $frame="STACK_FRAME"; # symbolic names, emitted verbatim into the assembly; | |
73 | $bias="STACK_BIAS"; # presumably resolved by crypto/sparc_arch.h - confirm | |
bcb43bb3 AP |
74 | $locals=64; # scratch bytes above the frame: +0..+24 FP<->integer transfer, +32/+40 dota/dotb, +48 saved %asi |
75 | ||
76 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | |
77 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | |
78 | # exclusively for pointers, indexes and other small values... | |
79 | # int bn_mul_mont( | |
80 | $rp="%i0"; # BN_ULONG *rp, | |
81 | $ap="%i1"; # const BN_ULONG *ap, | |
82 | $bp="%i2"; # const BN_ULONG *bp, | |
83 | $np="%i3"; # const BN_ULONG *np, | |
4d524040 | 84 | $n0="%i4"; # const BN_ULONG *n0, |
bcb43bb3 AP |
85 | $num="%i5"; # int num); |
86 | ||
aa2be094 | 87 | $tp="%l0"; # t[num] |
bcb43bb3 AP |
88 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved |
89 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | |
90 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | |
91 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | |
92 | $i="%l5"; # outer (b[]) byte index, runs from -num*8 up to 0 | |
93 | $j="%l6"; # inner (a[]/n[]) byte index, runs from -num*8 up to 0 | |
94 | $mask="%l7"; # 16-bit mask, 0xffff | |
95 | ||
aa2be094 AP |
96 | $n0="%g4"; # reassigned(!) to "64-bit" register |
97 | $carry="%i4"; # %i4 reused(!) for a carry bit | |
bcb43bb3 AP |
98 | |
99 | # FP register naming chart: one column per 16-bit digit, least significant | |
100 | # on the right; a product of a 32-bit half and a 16-bit digit spans three columns. | |
101 | #     ..HILO | |
102 | #       dcba | |
103 | #   -------- | |
104 | #        LOa | |
105 | #       LOb | |
106 | #      LOc | |
107 | #     LOd | |
108 | #      HIa | |
109 | #     HIb | |
110 | #    HIc | |
111 | #   HId | |
112 | #    ..a | |
113 | #   ..b | |
114 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; # b[i] as four 16-bit digits | |
115 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; # the n0-scaled multiplier as four 16-bit digits | |
116 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; # a[j] halves; the odd register receives the 32-bit word | |
117 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; # n[j] halves, same layout | |
118 | ||
119 | $dota="%f24"; $dotb="%f26"; # HIc/HId sums carried over into the next word (the ..a/..b rows) | |
120 | ||
121 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; # alo * {ba,bb,bc,bd} | |
122 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; # ahi * {ba,bb,bc,bd} | |
123 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; # nlo * {na,nb,nc,nd}, then the column sums | |
124 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; # nhi * {na,nb,nc,nd}, then the column sums | |
125 | ||
126 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | |
127 | ||
128 | $code=<<___; | |
52f7e44e TM |
129 | #ifndef __ASSEMBLER__ |
130 | # define __ASSEMBLER__ 1 | |
131 | #endif | |
132 | #include "crypto/sparc_arch.h" | |
eb77e888 | 133 | |
bcb43bb3 AP |
134 | .section ".text",#alloc,#execinstr |
135 | ||
136 | .global $fname | |
137 | .align 32 | |
138 | $fname: | |
aa2be094 | 139 | save %sp,-$frame-$locals,%sp ! new register window; frame plus scratch area |
6df8c74d | 140 | |
aa2be094 AP |
141 | cmp $num,4 ! fewer than 4 words is not supported |
142 | bl,a,pn %icc,.Lret | |
143 | clr %i0 ! return 0 (annulled delay slot, runs only if the branch is taken) | |
144 | andcc $num,1,%g0 ! $num has to be even... | |
145 | bnz,a,pn %icc,.Lret | |
146 | clr %i0 ! signal "unsupported input value" | |
760e3535 | 147 | |
aa2be094 | 148 | srl $num,1,$num ! halve: two 32-bit words are handled as one 64-bit word |
760e3535 | 149 | sethi %hi(0xffff),$mask ! upper bits of the 0xffff mask |
aa2be094 | 150 | ld [%i4+0],$n0 ! $n0 reassigned, remember? |
760e3535 | 151 | or $mask,%lo(0xffff),$mask ! mask=0xffff complete |
aa2be094 AP |
152 | ld [%i4+4],%o0 ! second 32-bit word of n0 |
153 | sllx %o0,32,%o0 | |
154 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | |
6df8c74d | 155 | |
aa2be094 | 156 | sll $num,3,$num ! num*=8 |
bcb43bb3 AP |
157 | |
158 | add %sp,$bias,%o0 ! real top of stack | |
159 | sll $num,2,%o1 ! %o1=num*4 | |
160 | add %o1,$num,%o1 ! %o1=num*5 | |
161 | sub %o0,%o1,%o0 ! room for t[] and the four FP vectors | |
bcb43bb3 | 162 | and %o0,-2048,%o0 ! optimize TLB utilization |
aa2be094 | 163 | sub %o0,$bias,%sp ! alloca(5*num*8) |
bcb43bb3 | 164 | |
aa2be094 | 165 | rd %asi,%o7 ! save %asi |
bcb43bb3 AP |
166 | add %sp,$bias+$frame+$locals,$tp ! t[] starts right above the scratch area |
167 | add $tp,$num,$ap_l ! step over t[] | |
aa2be094 | 168 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! |
bcb43bb3 AP |
169 | add $ap_l,$num,$ap_h |
170 | add $ap_h,$num,$np_l | |
171 | add $np_l,$num,$np_h | |
172 | ||
173 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | |
174 | ||
175 | add $rp,$num,$rp ! readjust input pointers to point | |
176 | add $ap,$num,$ap ! at the ends too... | |
177 | add $bp,$num,$bp ! (all indexing below uses negative offsets counting up to 0) | |
178 | add $np,$num,$np | |
179 | ||
aa2be094 | 180 | stx %o7,[%sp+$bias+$frame+48] ! save %asi |
bcb43bb3 | 181 | \f |
6df8c74d AP |
182 | sub %g0,$num,$i ! i=-num |
183 | sub %g0,$num,$j ! j=-num | |
bcb43bb3 AP |
184 | |
185 | add $ap,$j,%o3 | |
186 | add $bp,$i,%o4 | |
6df8c74d | 187 | |
87d3af64 AP |
188 | ld [%o3+4],%g1 ! ap[0], word at +4 (%o3 points into ap) |
189 | ld [%o3+0],%o0 ! ap[0], word at +0 | |
190 | ld [%o4+4],%g5 ! bp[0], word at +4 (%o4 points into bp) | |
191 | sllx %g1,32,%g1 | |
192 | ld [%o4+0],%o1 ! bp[0], word at +0 | |
193 | sllx %g5,32,%g5 | |
6df8c74d AP |
194 | or %g1,%o0,%o0 ! %o0=ap[0] as one 64-bit value |
195 | or %g5,%o1,%o1 ! %o1=bp[0] as one 64-bit value | |
196 | ||
aa2be094 | 197 | add $np,$j,%o5 |
bcb43bb3 AP |
198 | |
199 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | |
200 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | |
aa2be094 | 201 | stx %o0,[%sp+$bias+$frame+0] |
bcb43bb3 | 202 | |
6df8c74d | 203 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words |
aa2be094 | 204 | fzeros $alo |
6df8c74d | 205 | ld [%o3+4],$ahi_ |
aa2be094 | 206 | fzeros $ahi |
6df8c74d | 207 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words |
aa2be094 | 208 | fzeros $nlo |
6df8c74d | 209 | ld [%o5+4],$nhi_ |
aa2be094 | 210 | fzeros $nhi |
bcb43bb3 AP |
211 | |
212 | ! transfer b[i] to FPU as 4x16-bit values | |
6df8c74d | 213 | ldda [%o4+2]%asi,$ba |
bcb43bb3 | 214 | fxtod $alo,$alo |
6df8c74d | 215 | ldda [%o4+0]%asi,$bb |
bcb43bb3 | 216 | fxtod $ahi,$ahi |
6df8c74d | 217 | ldda [%o4+6]%asi,$bc |
bcb43bb3 | 218 | fxtod $nlo,$nlo |
6df8c74d | 219 | ldda [%o4+4]%asi,$bd |
bcb43bb3 AP |
220 | fxtod $nhi,$nhi |
221 | ||
222 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | |
aa2be094 | 223 | ldda [%sp+$bias+$frame+6]%asi,$na |
bcb43bb3 | 224 | fxtod $ba,$ba |
aa2be094 | 225 | ldda [%sp+$bias+$frame+4]%asi,$nb |
bcb43bb3 | 226 | fxtod $bb,$bb |
aa2be094 | 227 | ldda [%sp+$bias+$frame+2]%asi,$nc |
bcb43bb3 | 228 | fxtod $bc,$bc |
aa2be094 | 229 | ldda [%sp+$bias+$frame+0]%asi,$nd |
bcb43bb3 AP |
230 | fxtod $bd,$bd |
231 | ||
232 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
233 | fxtod $na,$na | |
234 | std $ahi,[$ap_h+$j] | |
235 | fxtod $nb,$nb | |
236 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
237 | fxtod $nc,$nc | |
238 | std $nhi,[$np_h+$j] | |
239 | fxtod $nd,$nd | |
240 | ||
aa2be094 AP |
241 | fmuld $alo,$ba,$aloa |
242 | fmuld $nlo,$na,$nloa | |
243 | fmuld $alo,$bb,$alob | |
244 | fmuld $nlo,$nb,$nlob | |
245 | fmuld $alo,$bc,$aloc | |
aa2be094 | 246 | faddd $aloa,$nloa,$nloa |
6df8c74d | 247 | fmuld $nlo,$nc,$nloc |
aa2be094 | 248 | fmuld $alo,$bd,$alod |
aa2be094 | 249 | faddd $alob,$nlob,$nlob |
6df8c74d | 250 | fmuld $nlo,$nd,$nlod |
aa2be094 | 251 | fmuld $ahi,$ba,$ahia |
aa2be094 | 252 | faddd $aloc,$nloc,$nloc |
6df8c74d | 253 | fmuld $nhi,$na,$nhia |
aa2be094 | 254 | fmuld $ahi,$bb,$ahib |
aa2be094 | 255 | faddd $alod,$nlod,$nlod |
6df8c74d | 256 | fmuld $nhi,$nb,$nhib |
aa2be094 | 257 | fmuld $ahi,$bc,$ahic |
aa2be094 | 258 | faddd $ahia,$nhia,$nhia |
6df8c74d | 259 | fmuld $nhi,$nc,$nhic |
aa2be094 | 260 | fmuld $ahi,$bd,$ahid |
6df8c74d | 261 | faddd $ahib,$nhib,$nhib |
aa2be094 | 262 | fmuld $nhi,$nd,$nhid |
bcb43bb3 | 263 | |
bcb43bb3 AP |
264 | faddd $ahic,$nhic,$dota ! $nhic |
265 | faddd $ahid,$nhid,$dotb ! $nhid | |
266 | ||
267 | faddd $nloc,$nhia,$nloc | |
268 | faddd $nlod,$nhib,$nlod | |
269 | ||
270 | fdtox $nloa,$nloa | |
271 | fdtox $nlob,$nlob | |
272 | fdtox $nloc,$nloc | |
273 | fdtox $nlod,$nlod | |
274 | ||
275 | std $nloa,[%sp+$bias+$frame+0] | |
2e21922e | 276 | add $j,8,$j |
bcb43bb3 | 277 | std $nlob,[%sp+$bias+$frame+8] |
2e21922e | 278 | add $ap,$j,%o4 |
bcb43bb3 | 279 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 280 | add $np,$j,%o5 |
bcb43bb3 | 281 | std $nlod,[%sp+$bias+$frame+24] |
bcb43bb3 | 282 | \f |
1c3d2b94 | 283 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words |
aa2be094 | 284 | fzeros $alo |
1c3d2b94 | 285 | ld [%o4+4],$ahi_ |
aa2be094 | 286 | fzeros $ahi |
1c3d2b94 | 287 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words |
aa2be094 | 288 | fzeros $nlo |
1c3d2b94 | 289 | ld [%o5+4],$nhi_ |
aa2be094 | 290 | fzeros $nhi |
bcb43bb3 AP |
291 | |
292 | fxtod $alo,$alo | |
293 | fxtod $ahi,$ahi | |
294 | fxtod $nlo,$nlo | |
295 | fxtod $nhi,$nhi | |
296 | ||
2e21922e | 297 | ldx [%sp+$bias+$frame+0],%o0 |
aa2be094 | 298 | fmuld $alo,$ba,$aloa |
2e21922e | 299 | ldx [%sp+$bias+$frame+8],%o1 |
aa2be094 | 300 | fmuld $nlo,$na,$nloa |
2e21922e | 301 | ldx [%sp+$bias+$frame+16],%o2 |
aa2be094 | 302 | fmuld $alo,$bb,$alob |
2e21922e | 303 | ldx [%sp+$bias+$frame+24],%o3 |
aa2be094 | 304 | fmuld $nlo,$nb,$nlob |
2e21922e AP |
305 | |
306 | srlx %o0,16,%o7 | |
307 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
aa2be094 | 308 | fmuld $alo,$bc,$aloc |
2e21922e AP |
309 | add %o7,%o1,%o1 |
310 | std $ahi,[$ap_h+$j] | |
311 | faddd $aloa,$nloa,$nloa | |
6df8c74d | 312 | fmuld $nlo,$nc,$nloc |
2e21922e AP |
313 | srlx %o1,16,%o7 |
314 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
aa2be094 | 315 | fmuld $alo,$bd,$alod |
2e21922e AP |
316 | add %o7,%o2,%o2 |
317 | std $nhi,[$np_h+$j] | |
318 | faddd $alob,$nlob,$nlob | |
6df8c74d | 319 | fmuld $nlo,$nd,$nlod |
2e21922e | 320 | srlx %o2,16,%o7 |
aa2be094 | 321 | fmuld $ahi,$ba,$ahia |
2e21922e AP |
322 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
323 | faddd $aloc,$nloc,$nloc | |
6df8c74d | 324 | fmuld $nhi,$na,$nhia |
2e21922e AP |
325 | !and %o0,$mask,%o0 |
326 | !and %o1,$mask,%o1 | |
327 | !and %o2,$mask,%o2 | |
328 | !sllx %o1,16,%o1 | |
329 | !sllx %o2,32,%o2 | |
330 | !sllx %o3,48,%o7 | |
331 | !or %o1,%o0,%o0 | |
332 | !or %o2,%o0,%o0 | |
333 | !or %o7,%o0,%o0 ! 64-bit result | |
334 | srlx %o3,16,%g1 ! 34-bit carry | |
aa2be094 | 335 | fmuld $ahi,$bb,$ahib |
2e21922e | 336 | |
aa2be094 | 337 | faddd $alod,$nlod,$nlod |
6df8c74d | 338 | fmuld $nhi,$nb,$nhib |
aa2be094 | 339 | fmuld $ahi,$bc,$ahic |
aa2be094 | 340 | faddd $ahia,$nhia,$nhia |
6df8c74d | 341 | fmuld $nhi,$nc,$nhic |
aa2be094 | 342 | fmuld $ahi,$bd,$ahid |
aa2be094 | 343 | faddd $ahib,$nhib,$nhib |
6df8c74d | 344 | fmuld $nhi,$nd,$nhid |
bcb43bb3 AP |
345 | |
346 | faddd $dota,$nloa,$nloa | |
347 | faddd $dotb,$nlob,$nlob | |
348 | faddd $ahic,$nhic,$dota ! $nhic | |
349 | faddd $ahid,$nhid,$dotb ! $nhid | |
350 | ||
351 | faddd $nloc,$nhia,$nloc | |
352 | faddd $nlod,$nhib,$nlod | |
353 | ||
354 | fdtox $nloa,$nloa | |
355 | fdtox $nlob,$nlob | |
356 | fdtox $nloc,$nloc | |
357 | fdtox $nlod,$nlod | |
358 | ||
359 | std $nloa,[%sp+$bias+$frame+0] | |
360 | std $nlob,[%sp+$bias+$frame+8] | |
2e21922e | 361 | addcc $j,8,$j |
bcb43bb3 | 362 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 363 | bz,pn %icc,.L1stskip |
bcb43bb3 | 364 | std $nlod,[%sp+$bias+$frame+24] |
1c3d2b94 | 365 | \f |
23296942 | 366 | .align 32 ! incidentally already aligned ! |
1c3d2b94 | 367 | .L1st: |
1c3d2b94 AP |
368 | add $ap,$j,%o4 |
369 | add $np,$j,%o5 | |
370 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | |
371 | fzeros $alo | |
372 | ld [%o4+4],$ahi_ | |
373 | fzeros $ahi | |
374 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | |
375 | fzeros $nlo | |
376 | ld [%o5+4],$nhi_ | |
377 | fzeros $nhi | |
378 | ||
379 | fxtod $alo,$alo | |
380 | fxtod $ahi,$ahi | |
381 | fxtod $nlo,$nlo | |
382 | fxtod $nhi,$nhi | |
383 | ||
2e21922e | 384 | ldx [%sp+$bias+$frame+0],%o0 |
1c3d2b94 | 385 | fmuld $alo,$ba,$aloa |
2e21922e | 386 | ldx [%sp+$bias+$frame+8],%o1 |
1c3d2b94 | 387 | fmuld $nlo,$na,$nloa |
2e21922e | 388 | ldx [%sp+$bias+$frame+16],%o2 |
1c3d2b94 | 389 | fmuld $alo,$bb,$alob |
2e21922e | 390 | ldx [%sp+$bias+$frame+24],%o3 |
1c3d2b94 | 391 | fmuld $nlo,$nb,$nlob |
2e21922e AP |
392 | |
393 | srlx %o0,16,%o7 | |
394 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
1c3d2b94 | 395 | fmuld $alo,$bc,$aloc |
2e21922e AP |
396 | add %o7,%o1,%o1 |
397 | std $ahi,[$ap_h+$j] | |
398 | faddd $aloa,$nloa,$nloa | |
1c3d2b94 | 399 | fmuld $nlo,$nc,$nloc |
2e21922e AP |
400 | srlx %o1,16,%o7 |
401 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
1c3d2b94 | 402 | fmuld $alo,$bd,$alod |
2e21922e AP |
403 | add %o7,%o2,%o2 |
404 | std $nhi,[$np_h+$j] | |
405 | faddd $alob,$nlob,$nlob | |
1c3d2b94 | 406 | fmuld $nlo,$nd,$nlod |
2e21922e | 407 | srlx %o2,16,%o7 |
1c3d2b94 | 408 | fmuld $ahi,$ba,$ahia |
2e21922e AP |
409 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
410 | and %o0,$mask,%o0 | |
411 | faddd $aloc,$nloc,$nloc | |
1c3d2b94 | 412 | fmuld $nhi,$na,$nhia |
2e21922e AP |
413 | and %o1,$mask,%o1 |
414 | and %o2,$mask,%o2 | |
1c3d2b94 | 415 | fmuld $ahi,$bb,$ahib |
2e21922e AP |
416 | sllx %o1,16,%o1 |
417 | faddd $alod,$nlod,$nlod | |
1c3d2b94 | 418 | fmuld $nhi,$nb,$nhib |
2e21922e | 419 | sllx %o2,32,%o2 |
1c3d2b94 | 420 | fmuld $ahi,$bc,$ahic |
2e21922e AP |
421 | sllx %o3,48,%o7 |
422 | or %o1,%o0,%o0 | |
423 | faddd $ahia,$nhia,$nhia | |
1c3d2b94 | 424 | fmuld $nhi,$nc,$nhic |
2e21922e | 425 | or %o2,%o0,%o0 |
1c3d2b94 | 426 | fmuld $ahi,$bd,$ahid |
2e21922e AP |
427 | or %o7,%o0,%o0 ! 64-bit result |
428 | faddd $ahib,$nhib,$nhib | |
1c3d2b94 | 429 | fmuld $nhi,$nd,$nhid |
2e21922e AP |
430 | addcc %g1,%o0,%o0 |
431 | faddd $dota,$nloa,$nloa | |
432 | srlx %o3,16,%g1 ! 34-bit carry | |
433 | faddd $dotb,$nlob,$nlob | |
434 | bcs,a %xcc,.+8 | |
435 | add %g1,1,%g1 | |
436 | ||
437 | stx %o0,[$tp] ! tp[j-1]= | |
1c3d2b94 | 438 | |
1c3d2b94 AP |
439 | faddd $ahic,$nhic,$dota ! $nhic |
440 | faddd $ahid,$nhid,$dotb ! $nhid | |
441 | ||
442 | faddd $nloc,$nhia,$nloc | |
443 | faddd $nlod,$nhib,$nlod | |
444 | ||
445 | fdtox $nloa,$nloa | |
446 | fdtox $nlob,$nlob | |
447 | fdtox $nloc,$nloc | |
448 | fdtox $nlod,$nlod | |
449 | ||
450 | std $nloa,[%sp+$bias+$frame+0] | |
451 | std $nlob,[%sp+$bias+$frame+8] | |
452 | std $nloc,[%sp+$bias+$frame+16] | |
453 | std $nlod,[%sp+$bias+$frame+24] | |
454 | ||
aa2be094 AP |
455 | addcc $j,8,$j |
456 | bnz,pt %icc,.L1st | |
bcb43bb3 | 457 | add $tp,8,$tp |
1c3d2b94 AP |
458 | \f |
459 | .L1stskip: | |
ebae8092 AP |
460 | fdtox $dota,$dota |
461 | fdtox $dotb,$dotb | |
462 | ||
1c3d2b94 AP |
463 | ldx [%sp+$bias+$frame+0],%o0 |
464 | ldx [%sp+$bias+$frame+8],%o1 | |
465 | ldx [%sp+$bias+$frame+16],%o2 | |
466 | ldx [%sp+$bias+$frame+24],%o3 | |
467 | ||
468 | srlx %o0,16,%o7 | |
ebae8092 | 469 | std $dota,[%sp+$bias+$frame+32] |
1c3d2b94 | 470 | add %o7,%o1,%o1 |
ebae8092 | 471 | std $dotb,[%sp+$bias+$frame+40] |
1c3d2b94 AP |
472 | srlx %o1,16,%o7 |
473 | add %o7,%o2,%o2 | |
474 | srlx %o2,16,%o7 | |
475 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
476 | and %o0,$mask,%o0 | |
477 | and %o1,$mask,%o1 | |
478 | and %o2,$mask,%o2 | |
479 | sllx %o1,16,%o1 | |
480 | sllx %o2,32,%o2 | |
481 | sllx %o3,48,%o7 | |
482 | or %o1,%o0,%o0 | |
483 | or %o2,%o0,%o0 | |
484 | or %o7,%o0,%o0 ! 64-bit result | |
ebae8092 | 485 | ldx [%sp+$bias+$frame+32],%o4 |
1c3d2b94 | 486 | addcc %g1,%o0,%o0 |
ebae8092 | 487 | ldx [%sp+$bias+$frame+40],%o5 |
1c3d2b94 AP |
488 | srlx %o3,16,%g1 ! 34-bit carry |
489 | bcs,a %xcc,.+8 | |
490 | add %g1,1,%g1 | |
491 | ||
492 | stx %o0,[$tp] ! tp[j-1]= | |
493 | add $tp,8,$tp | |
bcb43bb3 | 494 | |
ebae8092 AP |
495 | srlx %o4,16,%o7 |
496 | add %o7,%o5,%o5 | |
497 | and %o4,$mask,%o4 | |
498 | sllx %o5,16,%o7 | |
499 | or %o7,%o4,%o4 | |
500 | addcc %g1,%o4,%o4 | |
501 | srlx %o5,48,%g1 | |
bcb43bb3 AP |
502 | bcs,a %xcc,.+8 |
503 | add %g1,1,%g1 | |
504 | ||
505 | mov %g1,$carry | |
ebae8092 | 506 | stx %o4,[$tp] ! tp[num-1]= |
bcb43bb3 AP |
507 | \f |
508 | ba .Louter | |
509 | add $i,8,$i | |
510 | .align 32 | |
511 | .Louter: | |
6df8c74d | 512 | sub %g0,$num,$j ! j=-num |
bcb43bb3 AP |
513 | add %sp,$bias+$frame+$locals,$tp |
514 | ||
87d3af64 | 515 | add $ap,$j,%o3 |
bcb43bb3 | 516 | add $bp,$i,%o4 |
6df8c74d | 517 | |
87d3af64 AP |
518 | ld [%o3+4],%g1 ! ap[0], word at +4 (%o3 points into ap) |
519 | ld [%o3+0],%o0 ! ap[0], word at +0 | |
520 | ld [%o4+4],%g5 ! bp[i], word at +4 (%o4 points into bp) | |
521 | sllx %g1,32,%g1 | |
522 | ld [%o4+0],%o1 ! bp[i], word at +0 | |
523 | sllx %g5,32,%g5 | |
6df8c74d AP |
524 | or %g1,%o0,%o0 ! %o0=ap[0] as one 64-bit value |
525 | or %g5,%o1,%o1 ! %o1=bp[i] as one 64-bit value | |
526 | ||
bcb43bb3 AP |
527 | ldx [$tp],%o2 ! tp[0] |
528 | mulx %o1,%o0,%o0 | |
529 | addcc %o2,%o0,%o0 | |
530 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | |
aa2be094 | 531 | stx %o0,[%sp+$bias+$frame+0] |
bcb43bb3 | 532 | |
bcb43bb3 | 533 | ! transfer b[i] to FPU as 4x16-bit values |
6df8c74d AP |
534 | ldda [%o4+2]%asi,$ba |
535 | ldda [%o4+0]%asi,$bb | |
536 | ldda [%o4+6]%asi,$bc | |
537 | ldda [%o4+4]%asi,$bd | |
bcb43bb3 AP |
538 | |
539 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | |
aa2be094 | 540 | ldda [%sp+$bias+$frame+6]%asi,$na |
bcb43bb3 | 541 | fxtod $ba,$ba |
aa2be094 | 542 | ldda [%sp+$bias+$frame+4]%asi,$nb |
bcb43bb3 | 543 | fxtod $bb,$bb |
aa2be094 | 544 | ldda [%sp+$bias+$frame+2]%asi,$nc |
bcb43bb3 | 545 | fxtod $bc,$bc |
aa2be094 | 546 | ldda [%sp+$bias+$frame+0]%asi,$nd |
bcb43bb3 AP |
547 | fxtod $bd,$bd |
548 | ldd [$ap_l+$j],$alo ! load a[j] in double format | |
549 | fxtod $na,$na | |
550 | ldd [$ap_h+$j],$ahi | |
551 | fxtod $nb,$nb | |
552 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
553 | fxtod $nc,$nc | |
554 | ldd [$np_h+$j],$nhi | |
555 | fxtod $nd,$nd | |
556 | ||
aa2be094 AP |
557 | fmuld $alo,$ba,$aloa |
558 | fmuld $nlo,$na,$nloa | |
559 | fmuld $alo,$bb,$alob | |
560 | fmuld $nlo,$nb,$nlob | |
561 | fmuld $alo,$bc,$aloc | |
aa2be094 | 562 | faddd $aloa,$nloa,$nloa |
6df8c74d | 563 | fmuld $nlo,$nc,$nloc |
aa2be094 | 564 | fmuld $alo,$bd,$alod |
aa2be094 | 565 | faddd $alob,$nlob,$nlob |
6df8c74d | 566 | fmuld $nlo,$nd,$nlod |
aa2be094 | 567 | fmuld $ahi,$ba,$ahia |
aa2be094 | 568 | faddd $aloc,$nloc,$nloc |
6df8c74d | 569 | fmuld $nhi,$na,$nhia |
aa2be094 | 570 | fmuld $ahi,$bb,$ahib |
aa2be094 | 571 | faddd $alod,$nlod,$nlod |
6df8c74d | 572 | fmuld $nhi,$nb,$nhib |
aa2be094 | 573 | fmuld $ahi,$bc,$ahic |
aa2be094 | 574 | faddd $ahia,$nhia,$nhia |
6df8c74d | 575 | fmuld $nhi,$nc,$nhic |
aa2be094 | 576 | fmuld $ahi,$bd,$ahid |
6df8c74d | 577 | faddd $ahib,$nhib,$nhib |
aa2be094 | 578 | fmuld $nhi,$nd,$nhid |
bcb43bb3 | 579 | |
bcb43bb3 AP |
580 | faddd $ahic,$nhic,$dota ! $nhic |
581 | faddd $ahid,$nhid,$dotb ! $nhid | |
582 | ||
583 | faddd $nloc,$nhia,$nloc | |
584 | faddd $nlod,$nhib,$nlod | |
585 | ||
586 | fdtox $nloa,$nloa | |
587 | fdtox $nlob,$nlob | |
588 | fdtox $nloc,$nloc | |
589 | fdtox $nlod,$nlod | |
590 | ||
591 | std $nloa,[%sp+$bias+$frame+0] | |
592 | std $nlob,[%sp+$bias+$frame+8] | |
593 | std $nloc,[%sp+$bias+$frame+16] | |
2e21922e | 594 | add $j,8,$j |
bcb43bb3 | 595 | std $nlod,[%sp+$bias+$frame+24] |
2e21922e AP |
596 | \f |
597 | ldd [$ap_l+$j],$alo ! load a[j] in double format | |
598 | ldd [$ap_h+$j],$ahi | |
599 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
600 | ldd [$np_h+$j],$nhi | |
601 | ||
602 | fmuld $alo,$ba,$aloa | |
603 | fmuld $nlo,$na,$nloa | |
604 | fmuld $alo,$bb,$alob | |
605 | fmuld $nlo,$nb,$nlob | |
606 | fmuld $alo,$bc,$aloc | |
bcb43bb3 | 607 | ldx [%sp+$bias+$frame+0],%o0 |
2e21922e AP |
608 | faddd $aloa,$nloa,$nloa |
609 | fmuld $nlo,$nc,$nloc | |
bcb43bb3 | 610 | ldx [%sp+$bias+$frame+8],%o1 |
2e21922e | 611 | fmuld $alo,$bd,$alod |
bcb43bb3 | 612 | ldx [%sp+$bias+$frame+16],%o2 |
2e21922e AP |
613 | faddd $alob,$nlob,$nlob |
614 | fmuld $nlo,$nd,$nlod | |
bcb43bb3 | 615 | ldx [%sp+$bias+$frame+24],%o3 |
2e21922e | 616 | fmuld $ahi,$ba,$ahia |
bcb43bb3 AP |
617 | |
618 | srlx %o0,16,%o7 | |
2e21922e AP |
619 | faddd $aloc,$nloc,$nloc |
620 | fmuld $nhi,$na,$nhia | |
bcb43bb3 | 621 | add %o7,%o1,%o1 |
2e21922e | 622 | fmuld $ahi,$bb,$ahib |
bcb43bb3 | 623 | srlx %o1,16,%o7 |
2e21922e AP |
624 | faddd $alod,$nlod,$nlod |
625 | fmuld $nhi,$nb,$nhib | |
bcb43bb3 | 626 | add %o7,%o2,%o2 |
2e21922e | 627 | fmuld $ahi,$bc,$ahic |
bcb43bb3 | 628 | srlx %o2,16,%o7 |
2e21922e AP |
629 | faddd $ahia,$nhia,$nhia |
630 | fmuld $nhi,$nc,$nhic | |
bcb43bb3 AP |
631 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
632 | ! why? | |
633 | and %o0,$mask,%o0 | |
2e21922e | 634 | fmuld $ahi,$bd,$ahid |
bcb43bb3 AP |
635 | and %o1,$mask,%o1 |
636 | and %o2,$mask,%o2 | |
2e21922e AP |
637 | faddd $ahib,$nhib,$nhib |
638 | fmuld $nhi,$nd,$nhid | |
bcb43bb3 | 639 | sllx %o1,16,%o1 |
2e21922e | 640 | faddd $dota,$nloa,$nloa |
bcb43bb3 | 641 | sllx %o2,32,%o2 |
2e21922e | 642 | faddd $dotb,$nlob,$nlob |
bcb43bb3 AP |
643 | sllx %o3,48,%o7 |
644 | or %o1,%o0,%o0 | |
2e21922e | 645 | faddd $ahic,$nhic,$dota ! $nhic |
bcb43bb3 | 646 | or %o2,%o0,%o0 |
2e21922e | 647 | faddd $ahid,$nhid,$dotb ! $nhid |
bcb43bb3 AP |
648 | or %o7,%o0,%o0 ! 64-bit result |
649 | ldx [$tp],%o7 | |
2e21922e | 650 | faddd $nloc,$nhia,$nloc |
bcb43bb3 AP |
651 | addcc %o7,%o0,%o0 |
652 | ! end-of-why? | |
2e21922e | 653 | faddd $nlod,$nhib,$nlod |
bcb43bb3 | 654 | srlx %o3,16,%g1 ! 34-bit carry |
2e21922e | 655 | fdtox $nloa,$nloa |
bcb43bb3 AP |
656 | bcs,a %xcc,.+8 |
657 | add %g1,1,%g1 | |
bcb43bb3 | 658 | |
bcb43bb3 AP |
659 | fdtox $nlob,$nlob |
660 | fdtox $nloc,$nloc | |
661 | fdtox $nlod,$nlod | |
662 | ||
663 | std $nloa,[%sp+$bias+$frame+0] | |
664 | std $nlob,[%sp+$bias+$frame+8] | |
2e21922e | 665 | addcc $j,8,$j |
bcb43bb3 | 666 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 667 | bz,pn %icc,.Linnerskip |
bcb43bb3 | 668 | std $nlod,[%sp+$bias+$frame+24] |
1c3d2b94 | 669 | \f |
ebae8092 AP |
670 | ba .Linner |
671 | nop | |
672 | .align 32 | |
1c3d2b94 | 673 | .Linner: |
2e21922e AP |
674 | ldd [$ap_l+$j],$alo ! load a[j] in double format |
675 | ldd [$ap_h+$j],$ahi | |
676 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
677 | ldd [$np_h+$j],$nhi | |
678 | ||
679 | fmuld $alo,$ba,$aloa | |
680 | fmuld $nlo,$na,$nloa | |
681 | fmuld $alo,$bb,$alob | |
682 | fmuld $nlo,$nb,$nlob | |
683 | fmuld $alo,$bc,$aloc | |
bcb43bb3 | 684 | ldx [%sp+$bias+$frame+0],%o0 |
2e21922e AP |
685 | faddd $aloa,$nloa,$nloa |
686 | fmuld $nlo,$nc,$nloc | |
bcb43bb3 | 687 | ldx [%sp+$bias+$frame+8],%o1 |
2e21922e | 688 | fmuld $alo,$bd,$alod |
bcb43bb3 | 689 | ldx [%sp+$bias+$frame+16],%o2 |
2e21922e AP |
690 | faddd $alob,$nlob,$nlob |
691 | fmuld $nlo,$nd,$nlod | |
bcb43bb3 | 692 | ldx [%sp+$bias+$frame+24],%o3 |
2e21922e | 693 | fmuld $ahi,$ba,$ahia |
bcb43bb3 AP |
694 | |
695 | srlx %o0,16,%o7 | |
2e21922e AP |
696 | faddd $aloc,$nloc,$nloc |
697 | fmuld $nhi,$na,$nhia | |
bcb43bb3 | 698 | add %o7,%o1,%o1 |
2e21922e | 699 | fmuld $ahi,$bb,$ahib |
bcb43bb3 | 700 | srlx %o1,16,%o7 |
2e21922e AP |
701 | faddd $alod,$nlod,$nlod |
702 | fmuld $nhi,$nb,$nhib | |
bcb43bb3 | 703 | add %o7,%o2,%o2 |
2e21922e | 704 | fmuld $ahi,$bc,$ahic |
bcb43bb3 | 705 | srlx %o2,16,%o7 |
2e21922e AP |
706 | faddd $ahia,$nhia,$nhia |
707 | fmuld $nhi,$nc,$nhic | |
bcb43bb3 AP |
708 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
709 | and %o0,$mask,%o0 | |
2e21922e | 710 | fmuld $ahi,$bd,$ahid |
bcb43bb3 AP |
711 | and %o1,$mask,%o1 |
712 | and %o2,$mask,%o2 | |
2e21922e AP |
713 | faddd $ahib,$nhib,$nhib |
714 | fmuld $nhi,$nd,$nhid | |
bcb43bb3 | 715 | sllx %o1,16,%o1 |
2e21922e | 716 | faddd $dota,$nloa,$nloa |
bcb43bb3 | 717 | sllx %o2,32,%o2 |
2e21922e | 718 | faddd $dotb,$nlob,$nlob |
bcb43bb3 AP |
719 | sllx %o3,48,%o7 |
720 | or %o1,%o0,%o0 | |
2e21922e | 721 | faddd $ahic,$nhic,$dota ! $nhic |
bcb43bb3 | 722 | or %o2,%o0,%o0 |
2e21922e | 723 | faddd $ahid,$nhid,$dotb ! $nhid |
bcb43bb3 | 724 | or %o7,%o0,%o0 ! 64-bit result |
2e21922e | 725 | faddd $nloc,$nhia,$nloc |
bcb43bb3 | 726 | addcc %g1,%o0,%o0 |
ebae8092 | 727 | ldx [$tp+8],%o7 ! tp[j] |
2e21922e | 728 | faddd $nlod,$nhib,$nlod |
bcb43bb3 | 729 | srlx %o3,16,%g1 ! 34-bit carry |
2e21922e | 730 | fdtox $nloa,$nloa |
bcb43bb3 AP |
731 | bcs,a %xcc,.+8 |
732 | add %g1,1,%g1 | |
2e21922e | 733 | fdtox $nlob,$nlob |
bcb43bb3 | 734 | addcc %o7,%o0,%o0 |
2e21922e | 735 | fdtox $nloc,$nloc |
bcb43bb3 AP |
736 | bcs,a %xcc,.+8 |
737 | add %g1,1,%g1 | |
738 | ||
739 | stx %o0,[$tp] ! tp[j-1] | |
2e21922e | 740 | fdtox $nlod,$nlod |
1c3d2b94 AP |
741 | |
742 | std $nloa,[%sp+$bias+$frame+0] | |
743 | std $nlob,[%sp+$bias+$frame+8] | |
744 | std $nloc,[%sp+$bias+$frame+16] | |
aa2be094 | 745 | addcc $j,8,$j |
2e21922e | 746 | std $nlod,[%sp+$bias+$frame+24] |
aa2be094 | 747 | bnz,pt %icc,.Linner |
bcb43bb3 | 748 | add $tp,8,$tp |
1c3d2b94 AP |
749 | \f |
750 | .Linnerskip: | |
2e21922e AP |
751 | fdtox $dota,$dota |
752 | fdtox $dotb,$dotb | |
753 | ||
1c3d2b94 AP |
754 | ldx [%sp+$bias+$frame+0],%o0 |
755 | ldx [%sp+$bias+$frame+8],%o1 | |
756 | ldx [%sp+$bias+$frame+16],%o2 | |
757 | ldx [%sp+$bias+$frame+24],%o3 | |
758 | ||
759 | srlx %o0,16,%o7 | |
2e21922e | 760 | std $dota,[%sp+$bias+$frame+32] |
1c3d2b94 | 761 | add %o7,%o1,%o1 |
2e21922e | 762 | std $dotb,[%sp+$bias+$frame+40] |
1c3d2b94 AP |
763 | srlx %o1,16,%o7 |
764 | add %o7,%o2,%o2 | |
765 | srlx %o2,16,%o7 | |
766 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
767 | and %o0,$mask,%o0 | |
768 | and %o1,$mask,%o1 | |
769 | and %o2,$mask,%o2 | |
770 | sllx %o1,16,%o1 | |
771 | sllx %o2,32,%o2 | |
772 | sllx %o3,48,%o7 | |
773 | or %o1,%o0,%o0 | |
774 | or %o2,%o0,%o0 | |
2e21922e | 775 | ldx [%sp+$bias+$frame+32],%o4 |
1c3d2b94 | 776 | or %o7,%o0,%o0 ! 64-bit result |
2e21922e | 777 | ldx [%sp+$bias+$frame+40],%o5 |
1c3d2b94 | 778 | addcc %g1,%o0,%o0 |
2e21922e | 779 | ldx [$tp+8],%o7 ! tp[j] |
1c3d2b94 AP |
780 | srlx %o3,16,%g1 ! 34-bit carry |
781 | bcs,a %xcc,.+8 | |
782 | add %g1,1,%g1 | |
783 | ||
1c3d2b94 AP |
784 | addcc %o7,%o0,%o0 |
785 | bcs,a %xcc,.+8 | |
786 | add %g1,1,%g1 | |
787 | ||
788 | stx %o0,[$tp] ! tp[j-1] | |
789 | add $tp,8,$tp | |
bcb43bb3 | 790 | |
2e21922e AP |
791 | srlx %o4,16,%o7 |
792 | add %o7,%o5,%o5 | |
793 | and %o4,$mask,%o4 | |
794 | sllx %o5,16,%o7 | |
795 | or %o7,%o4,%o4 | |
796 | addcc %g1,%o4,%o4 | |
797 | srlx %o5,48,%g1 | |
bcb43bb3 AP |
798 | bcs,a %xcc,.+8 |
799 | add %g1,1,%g1 | |
800 | ||
2e21922e AP |
801 | addcc $carry,%o4,%o4 |
802 | stx %o4,[$tp] ! tp[num-1] | |
bcb43bb3 AP |
803 | mov %g1,$carry |
804 | bcs,a %xcc,.+8 | |
805 | add $carry,1,$carry | |
806 | ||
aa2be094 AP |
807 | addcc $i,8,$i |
808 | bnz %icc,.Louter | |
bcb43bb3 AP |
809 | nop |
810 | \f | |
7d9cf7c0 | 811 | add $tp,8,$tp ! adjust tp to point at the end |
7d9cf7c0 | 812 | orn %g0,%g0,%g4 ! %g4=all ones (overwritten by the subc after .Lsub) |
7d9cf7c0 | 813 | sub %g0,$num,%o7 ! n=-num |
23296942 | 814 | ba .Lsub |
673c55a2 | 815 | subcc %g0,%g0,%g0 ! clear %icc.c |
23296942 AP |
816 | |
817 | .align 32 | |
bcb43bb3 | 818 | .Lsub: ! rp[]=t[]-np[], 32 bits at a time, borrow kept in %icc.c |
87d3af64 AP |
819 | ldx [$tp+%o7],%o0 ! 64-bit word of t[] |
820 | add $np,%o7,%g1 | |
821 | ld [%g1+0],%o2 ! matching pair of 32-bit words of np[] | |
822 | ld [%g1+4],%o3 | |
823 | srlx %o0,32,%o1 ! upper half of the t[] word | |
824 | subccc %o0,%o2,%o2 ! first word, borrow in and out through %icc.c | |
825 | add $rp,%o7,%g1 ! plain add, flags are left alone | |
826 | subccc %o1,%o3,%o3 ! second word | |
827 | st %o2,[%g1+0] | |
aa2be094 AP |
828 | add %o7,8,%o7 |
829 | brnz,pt %o7,.Lsub ! branches on the register value, flags are left alone | |
87d3af64 | 830 | st %o3,[%g1+4] ! (delay slot) |
7d9cf7c0 | 831 | subc $carry,0,%g4 ! %g4=carry-borrow: all ones if the subtraction borrowed with the carry clear, zero if they cancel |
6df8c74d | 832 | sub %g0,$num,%o7 ! n=-num |
23296942 AP |
833 | ba .Lcopy |
834 | nop | |
bcb43bb3 | 835 | |
23296942 | 836 | .align 32 |
bcb43bb3 | 837 | .Lcopy: ! branch-free select: rp[]=(t[] AND %g4) OR (rp[] AND NOT %g4) |
aa2be094 | 838 | ldx [$tp+%o7],%o0 |
87d3af64 | 839 | add $rp,%o7,%g1 |
7d9cf7c0 AP |
840 | ld [%g1+0],%o2 ! the difference stored by .Lsub |
841 | ld [%g1+4],%o3 | |
842 | stx %g0,[$tp+%o7] ! wipe t[] on the way | |
843 | and %o0,%g4,%o0 ! keep t[] only if the mask is set | |
844 | srlx %o0,32,%o1 | |
845 | andn %o2,%g4,%o2 ! keep the difference only if the mask is clear | |
846 | andn %o3,%g4,%o3 | |
847 | or %o2,%o0,%o0 | |
848 | or %o3,%o1,%o1 | |
87d3af64 | 849 | st %o0,[%g1+0] |
aa2be094 AP |
850 | add %o7,8,%o7 |
851 | brnz,pt %o7,.Lcopy | |
87d3af64 | 852 | st %o1,[%g1+4] ! (delay slot) |
6df8c74d | 853 | sub %g0,$num,%o7 ! n=-num |
bcb43bb3 | 854 | |
bcb43bb3 | 855 | .Lzap: ! wipe the four FP vectors holding a[] and n[] |
aa2be094 AP |
856 | stx %g0,[$ap_l+%o7] |
857 | stx %g0,[$ap_h+%o7] | |
858 | stx %g0,[$np_l+%o7] | |
859 | stx %g0,[$np_h+%o7] | |
860 | add %o7,8,%o7 | |
861 | brnz,pt %o7,.Lzap | |
bcb43bb3 AP |
862 | nop |
863 | ||
864 | ldx [%sp+$bias+$frame+48],%o7 ! value stored by the prologue | |
865 | wr %g0,%o7,%asi ! restore %asi | |
866 | ||
867 | mov 1,%i0 ! return 1; the early exits above arrive with 0 | |
aa2be094 | 868 | .Lret: |
bcb43bb3 AP |
869 | ret |
870 | restore | |
871 | .type $fname,#function | |
872 | .size $fname,(.-$fname) | |
46f4e1be | 873 | .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" |
23296942 | 874 | .align 32 |
bcb43bb3 AP |
875 | ___ |
876 | ||
877 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every backtick-quoted span with the result of eval'ing it | |
3b4a0225 AP |
878 | |
879 | # The substitution below makes it possible to compile without demanding | |
478b50cf | 880 | # VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I |
3b4a0225 AP |
881 | # dare to do this, because VIS capability is detected at run-time now |
882 | # and this routine is not called on CPUs incapable of executing it. Do | |
883 | # note that fzeros is not the only VIS dependency! Another dependency | |
884 | # is implicit and is just _a_ numerical value loaded to %asi register, | |
885 | # which assembler can't recognize as VIS specific... Each fzeros becomes a raw .word: 0x81b00c20 with the register number shifted left by 25. | |
886 | $code =~ s/fzeros\s+%f([0-9]+)/ | |
887 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | |
888 | /gem; | |
889 | ||
bcb43bb3 | 890 | print $code; # emit the generated assembly on STDOUT |
3b4a0225 | 891 | # flush; the close result is checked so that a failed write is reported |
a21314db | 892 | close STDOUT or die "error closing STDOUT: $!"; |