]>
Commit | Line | Data |
---|---|---|
bcb43bb3 AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
2e21922e AP |
5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
bcb43bb3 AP |
8 | # ==================================================================== |
9 | ||
aa2be094 AP |
10 | # October 2005 |
11 | # | |
bcb43bb3 AP |
12 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? |
13 | # Because unlike integer multiplier, which simply stalls whole CPU, | |
14 | # FPU is fully pipelined and can effectively emit 48 bit partial | |
15 | # product every cycle. Why not blended SPARC v9? One can argue that | |
16 | # making this module dependent on UltraSPARC VIS extension limits its | |
a4d729f3 AP |
17 | # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) |
18 | # implementations from compatibility matrix. But the rest, whole Sun | |
19 | # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | |
20 | # VIS extension instructions used in this module. This is considered | |
73b979e6 AP |
21 | # good enough to not care about HAL SPARC64 users [if any] who have |
22 | # integer-only pure SPARCv9 module to "fall down" to. | |
bcb43bb3 AP |
23 | |
24 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | |
25 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | |
26 | # performance improves by a few percent for shorter keys and worsens | |
aa2be094 | 27 | # by a few percent for longer keys. This is because USIII integer multiplier |
bcb43bb3 AP |
28 | # is >3x faster than USI&II one, which is harder to match [but see |
29 | # TODO list below]. It should also be noted that SPARC64 V features | |
30 | # out-of-order execution, which *might* mean that integer multiplier | |
a4d729f3 AP |
31 | # is pipelined, which in turn *might* be impossible to match... As an |
32 | # additional note, SPARC64 V implements an FP Multiply-Add instruction, | |
33 | # which is perfectly usable in this context... In other words, as far | |
73b979e6 | 34 | # as Fujitsu SPARC64 V goes, talk to the author:-) |
aa2be094 | 35 | |
a00e414f AP |
36 | # The implementation imposes the following "non-natural" limitations on |
37 | # input arguments: | |
aa2be094 AP |
38 | # - num may not be less than 4; |
39 | # - num has to be even; | |
aa2be094 AP |
40 | # Failure to meet either condition has no fatal effects; it simply |
41 | # doesn't give any performance gain. | |
42 | ||
bcb43bb3 | 43 | # TODO: |
bcb43bb3 AP |
44 | # - modulo-schedule inner loop for better performance (on in-order |
45 | # execution core such as UltraSPARC this shall result in further | |
46 | # noticeable(!) improvement); | |
47 | # - dedicated squaring procedure[?]; | |
48 | ||
2e21922e AP |
49 | ###################################################################### |
50 | # November 2006 | |
51 | # | |
52 | # Modulo-scheduled inner loops allow interleaving floating point and | |
53 | # integer instructions and minimize Read-After-Write penalties. This | |
54 | # results in *further* 20-50% performance improvement [depending on | |
55 | # key length, more for longer keys] on USI&II cores and 30-80% - on | |
56 | # USIII&IV. | |
57 | ||
6bd7a4d9 RL |
58 | $output = pop; # last command-line argument names the output file |
59 | open(STDOUT,">",$output) or die "can't open $output: $!" if defined($output); # 3-arg, checked open; STDOUT is left untouched when no output file is given | |
60 | ||
a00e414f | 61 | $fname="bn_mul_mont_fpu"; # symbol name given to the emitted routine (.global/.type/.size below) |
bcb43bb3 | 62 | $bits=32; # default ABI width; switched to 64 by the flag scan on the next line |
3b4a0225 | 63 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # any remaining argument containing -m64 or -xarch=v9 selects 64-bit |
bcb43bb3 AP |
64 | |
65 | if ($bits==64) { # 64-bit ABI was requested on the command line | |
66 | $bias=2047; # added to %sp wherever the emitted code addresses the stack | |
67 | $frame=192; # NOTE(review): presumably the minimum 64-bit ABI frame size -- confirm | |
68 | } else { | |
69 | $bias=0; # no stack bias is applied in the 32-bit case | |
70 | $frame=128; # 96 rounded up to largest known cache-line | |
71 | } | |
72 | $locals=64; # scratch area above the frame; the emitted code uses 8-byte slots at offsets 0 through 48 | |
73 | ||
74 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | |
75 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | |
76 | # exclusively for pointers, indexes and other small values... | |
77 | # int bn_mul_mont( | |
78 | $rp="%i0"; # BN_ULONG *rp, | |
79 | $ap="%i1"; # const BN_ULONG *ap, | |
80 | $bp="%i2"; # const BN_ULONG *bp, | |
81 | $np="%i3"; # const BN_ULONG *np, | |
4d524040 | 82 | $n0="%i4"; # const BN_ULONG *n0, |
bcb43bb3 AP |
83 | $num="%i5"; # int num); |
84 | ||
aa2be094 | 85 | $tp="%l0"; # t[num] |
bcb43bb3 AP |
86 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved |
87 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | |
88 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | |
89 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | |
90 | $i="%l5"; # outer-loop byte offset into bp: starts at -num and is stepped by 8 until it reaches zero | |
91 | $j="%l6"; # inner-loop byte offset into ap/np and the smashed vectors, likewise stepped from -num up to zero | |
92 | $mask="%l7"; # 16-bit mask, 0xffff | |
93 | ||
aa2be094 AP |
94 | $n0="%g4"; # reassigned(!) to "64-bit" register |
95 | $carry="%i4"; # %i4 reused(!) for a carry bit | |
bcb43bb3 AP |
96 | |
97 | # FP register naming chart | |
98 | # | |
99 | #     ..HILO | |
100 | #       dcba | |
101 | #   -------- | |
102 | #        LOa | |
103 | #       LOb | |
104 | #      LOc | |
105 | #     LOd | |
106 | #      HIa | |
107 | #     HIb | |
108 | #    HIc | |
109 | #   HId | |
110 | #    ..a | |
111 | #   ..b | |
112 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; # b[i] split into four 16-bit digits | |
113 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; # the n0-scaled multiplier split into four 16-bit digits | |
114 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; # a[j] halves: the 32-bit word is loaded into the "_" register and fzeros clears its partner before fxtod | |
115 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; # n[j] halves, loaded the same way | |
116 | ||
117 | $dota="%f24"; $dotb="%f26"; # HIc and HId column sums, carried over and added to the next step's LOa and LOb sums | |
118 | ||
119 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; # partial products $alo x $ba..$bd | |
120 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; # partial products $ahi x $ba..$bd | |
121 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; # partial products $nlo x $na..$nd; also receive the column sums | |
122 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; # partial products $nhi x $na..$nd | |
123 | ||
124 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | |
125 | ||
126 | $code=<<___; | |
bcb43bb3 AP |
127 | .section ".text",#alloc,#execinstr |
128 | ||
129 | .global $fname | |
130 | .align 32 | |
131 | $fname: | |
aa2be094 | 132 | save %sp,-$frame-$locals,%sp |
6df8c74d | 133 | |
aa2be094 AP |
134 | cmp $num,4 |
135 | bl,a,pn %icc,.Lret | |
136 | clr %i0 | |
137 | andcc $num,1,%g0 ! $num has to be even... | |
138 | bnz,a,pn %icc,.Lret | |
139 | clr %i0 ! signal "unsupported input value" | |
760e3535 | 140 | |
aa2be094 | 141 | srl $num,1,$num |
760e3535 | 142 | sethi %hi(0xffff),$mask |
aa2be094 | 143 | ld [%i4+0],$n0 ! $n0 reassigned, remember? |
760e3535 | 144 | or $mask,%lo(0xffff),$mask |
aa2be094 AP |
145 | ld [%i4+4],%o0 |
146 | sllx %o0,32,%o0 | |
147 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | |
6df8c74d | 148 | |
aa2be094 | 149 | sll $num,3,$num ! num*=8 |
bcb43bb3 AP |
150 | |
151 | add %sp,$bias,%o0 ! real top of stack | |
152 | sll $num,2,%o1 | |
153 | add %o1,$num,%o1 ! %o1=num*5 | |
154 | sub %o0,%o1,%o0 | |
bcb43bb3 | 155 | and %o0,-2048,%o0 ! optimize TLB utilization |
aa2be094 | 156 | sub %o0,$bias,%sp ! alloca(5*num*8) |
bcb43bb3 | 157 | |
aa2be094 | 158 | rd %asi,%o7 ! save %asi |
bcb43bb3 AP |
159 | add %sp,$bias+$frame+$locals,$tp |
160 | add $tp,$num,$ap_l | |
aa2be094 | 161 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! |
bcb43bb3 AP |
162 | add $ap_l,$num,$ap_h |
163 | add $ap_h,$num,$np_l | |
164 | add $np_l,$num,$np_h | |
165 | ||
166 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | |
167 | ||
168 | add $rp,$num,$rp ! readjust input pointers to point | |
169 | add $ap,$num,$ap ! at the ends too... | |
170 | add $bp,$num,$bp | |
171 | add $np,$num,$np | |
172 | ||
aa2be094 | 173 | stx %o7,[%sp+$bias+$frame+48] ! save %asi |
bcb43bb3 | 174 | \f |
6df8c74d AP |
175 | sub %g0,$num,$i ! i=-num |
176 | sub %g0,$num,$j ! j=-num | |
bcb43bb3 AP |
177 | |
178 | add $ap,$j,%o3 | |
179 | add $bp,$i,%o4 | |
6df8c74d | 180 | |
87d3af64 AP |
181 | ld [%o3+4],%g1 ! bp[0] |
182 | ld [%o3+0],%o0 | |
183 | ld [%o4+4],%g5 ! ap[0] | |
184 | sllx %g1,32,%g1 | |
185 | ld [%o4+0],%o1 | |
186 | sllx %g5,32,%g5 | |
6df8c74d AP |
187 | or %g1,%o0,%o0 |
188 | or %g5,%o1,%o1 | |
189 | ||
aa2be094 | 190 | add $np,$j,%o5 |
bcb43bb3 AP |
191 | |
192 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | |
193 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | |
aa2be094 | 194 | stx %o0,[%sp+$bias+$frame+0] |
bcb43bb3 | 195 | |
6df8c74d | 196 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words |
aa2be094 | 197 | fzeros $alo |
6df8c74d | 198 | ld [%o3+4],$ahi_ |
aa2be094 | 199 | fzeros $ahi |
6df8c74d | 200 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words |
aa2be094 | 201 | fzeros $nlo |
6df8c74d | 202 | ld [%o5+4],$nhi_ |
aa2be094 | 203 | fzeros $nhi |
bcb43bb3 AP |
204 | |
205 | ! transfer b[i] to FPU as 4x16-bit values | |
6df8c74d | 206 | ldda [%o4+2]%asi,$ba |
bcb43bb3 | 207 | fxtod $alo,$alo |
6df8c74d | 208 | ldda [%o4+0]%asi,$bb |
bcb43bb3 | 209 | fxtod $ahi,$ahi |
6df8c74d | 210 | ldda [%o4+6]%asi,$bc |
bcb43bb3 | 211 | fxtod $nlo,$nlo |
6df8c74d | 212 | ldda [%o4+4]%asi,$bd |
bcb43bb3 AP |
213 | fxtod $nhi,$nhi |
214 | ||
215 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | |
aa2be094 | 216 | ldda [%sp+$bias+$frame+6]%asi,$na |
bcb43bb3 | 217 | fxtod $ba,$ba |
aa2be094 | 218 | ldda [%sp+$bias+$frame+4]%asi,$nb |
bcb43bb3 | 219 | fxtod $bb,$bb |
aa2be094 | 220 | ldda [%sp+$bias+$frame+2]%asi,$nc |
bcb43bb3 | 221 | fxtod $bc,$bc |
aa2be094 | 222 | ldda [%sp+$bias+$frame+0]%asi,$nd |
bcb43bb3 AP |
223 | fxtod $bd,$bd |
224 | ||
225 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
226 | fxtod $na,$na | |
227 | std $ahi,[$ap_h+$j] | |
228 | fxtod $nb,$nb | |
229 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
230 | fxtod $nc,$nc | |
231 | std $nhi,[$np_h+$j] | |
232 | fxtod $nd,$nd | |
233 | ||
aa2be094 AP |
234 | fmuld $alo,$ba,$aloa |
235 | fmuld $nlo,$na,$nloa | |
236 | fmuld $alo,$bb,$alob | |
237 | fmuld $nlo,$nb,$nlob | |
238 | fmuld $alo,$bc,$aloc | |
aa2be094 | 239 | faddd $aloa,$nloa,$nloa |
6df8c74d | 240 | fmuld $nlo,$nc,$nloc |
aa2be094 | 241 | fmuld $alo,$bd,$alod |
aa2be094 | 242 | faddd $alob,$nlob,$nlob |
6df8c74d | 243 | fmuld $nlo,$nd,$nlod |
aa2be094 | 244 | fmuld $ahi,$ba,$ahia |
aa2be094 | 245 | faddd $aloc,$nloc,$nloc |
6df8c74d | 246 | fmuld $nhi,$na,$nhia |
aa2be094 | 247 | fmuld $ahi,$bb,$ahib |
aa2be094 | 248 | faddd $alod,$nlod,$nlod |
6df8c74d | 249 | fmuld $nhi,$nb,$nhib |
aa2be094 | 250 | fmuld $ahi,$bc,$ahic |
aa2be094 | 251 | faddd $ahia,$nhia,$nhia |
6df8c74d | 252 | fmuld $nhi,$nc,$nhic |
aa2be094 | 253 | fmuld $ahi,$bd,$ahid |
6df8c74d | 254 | faddd $ahib,$nhib,$nhib |
aa2be094 | 255 | fmuld $nhi,$nd,$nhid |
bcb43bb3 | 256 | |
bcb43bb3 AP |
257 | faddd $ahic,$nhic,$dota ! $nhic |
258 | faddd $ahid,$nhid,$dotb ! $nhid | |
259 | ||
260 | faddd $nloc,$nhia,$nloc | |
261 | faddd $nlod,$nhib,$nlod | |
262 | ||
263 | fdtox $nloa,$nloa | |
264 | fdtox $nlob,$nlob | |
265 | fdtox $nloc,$nloc | |
266 | fdtox $nlod,$nlod | |
267 | ||
268 | std $nloa,[%sp+$bias+$frame+0] | |
2e21922e | 269 | add $j,8,$j |
bcb43bb3 | 270 | std $nlob,[%sp+$bias+$frame+8] |
2e21922e | 271 | add $ap,$j,%o4 |
bcb43bb3 | 272 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 273 | add $np,$j,%o5 |
bcb43bb3 | 274 | std $nlod,[%sp+$bias+$frame+24] |
bcb43bb3 | 275 | \f |
1c3d2b94 | 276 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words |
aa2be094 | 277 | fzeros $alo |
1c3d2b94 | 278 | ld [%o4+4],$ahi_ |
aa2be094 | 279 | fzeros $ahi |
1c3d2b94 | 280 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words |
aa2be094 | 281 | fzeros $nlo |
1c3d2b94 | 282 | ld [%o5+4],$nhi_ |
aa2be094 | 283 | fzeros $nhi |
bcb43bb3 AP |
284 | |
285 | fxtod $alo,$alo | |
286 | fxtod $ahi,$ahi | |
287 | fxtod $nlo,$nlo | |
288 | fxtod $nhi,$nhi | |
289 | ||
2e21922e | 290 | ldx [%sp+$bias+$frame+0],%o0 |
aa2be094 | 291 | fmuld $alo,$ba,$aloa |
2e21922e | 292 | ldx [%sp+$bias+$frame+8],%o1 |
aa2be094 | 293 | fmuld $nlo,$na,$nloa |
2e21922e | 294 | ldx [%sp+$bias+$frame+16],%o2 |
aa2be094 | 295 | fmuld $alo,$bb,$alob |
2e21922e | 296 | ldx [%sp+$bias+$frame+24],%o3 |
aa2be094 | 297 | fmuld $nlo,$nb,$nlob |
2e21922e AP |
298 | |
299 | srlx %o0,16,%o7 | |
300 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
aa2be094 | 301 | fmuld $alo,$bc,$aloc |
2e21922e AP |
302 | add %o7,%o1,%o1 |
303 | std $ahi,[$ap_h+$j] | |
304 | faddd $aloa,$nloa,$nloa | |
6df8c74d | 305 | fmuld $nlo,$nc,$nloc |
2e21922e AP |
306 | srlx %o1,16,%o7 |
307 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
aa2be094 | 308 | fmuld $alo,$bd,$alod |
2e21922e AP |
309 | add %o7,%o2,%o2 |
310 | std $nhi,[$np_h+$j] | |
311 | faddd $alob,$nlob,$nlob | |
6df8c74d | 312 | fmuld $nlo,$nd,$nlod |
2e21922e | 313 | srlx %o2,16,%o7 |
aa2be094 | 314 | fmuld $ahi,$ba,$ahia |
2e21922e AP |
315 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
316 | faddd $aloc,$nloc,$nloc | |
6df8c74d | 317 | fmuld $nhi,$na,$nhia |
2e21922e AP |
318 | !and %o0,$mask,%o0 |
319 | !and %o1,$mask,%o1 | |
320 | !and %o2,$mask,%o2 | |
321 | !sllx %o1,16,%o1 | |
322 | !sllx %o2,32,%o2 | |
323 | !sllx %o3,48,%o7 | |
324 | !or %o1,%o0,%o0 | |
325 | !or %o2,%o0,%o0 | |
326 | !or %o7,%o0,%o0 ! 64-bit result | |
327 | srlx %o3,16,%g1 ! 34-bit carry | |
aa2be094 | 328 | fmuld $ahi,$bb,$ahib |
2e21922e | 329 | |
aa2be094 | 330 | faddd $alod,$nlod,$nlod |
6df8c74d | 331 | fmuld $nhi,$nb,$nhib |
aa2be094 | 332 | fmuld $ahi,$bc,$ahic |
aa2be094 | 333 | faddd $ahia,$nhia,$nhia |
6df8c74d | 334 | fmuld $nhi,$nc,$nhic |
aa2be094 | 335 | fmuld $ahi,$bd,$ahid |
aa2be094 | 336 | faddd $ahib,$nhib,$nhib |
6df8c74d | 337 | fmuld $nhi,$nd,$nhid |
bcb43bb3 AP |
338 | |
339 | faddd $dota,$nloa,$nloa | |
340 | faddd $dotb,$nlob,$nlob | |
341 | faddd $ahic,$nhic,$dota ! $nhic | |
342 | faddd $ahid,$nhid,$dotb ! $nhid | |
343 | ||
344 | faddd $nloc,$nhia,$nloc | |
345 | faddd $nlod,$nhib,$nlod | |
346 | ||
347 | fdtox $nloa,$nloa | |
348 | fdtox $nlob,$nlob | |
349 | fdtox $nloc,$nloc | |
350 | fdtox $nlod,$nlod | |
351 | ||
352 | std $nloa,[%sp+$bias+$frame+0] | |
353 | std $nlob,[%sp+$bias+$frame+8] | |
2e21922e | 354 | addcc $j,8,$j |
bcb43bb3 | 355 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 356 | bz,pn %icc,.L1stskip |
bcb43bb3 | 357 | std $nlod,[%sp+$bias+$frame+24] |
1c3d2b94 | 358 | \f |
23296942 | 359 | .align 32 ! incidentally already aligned ! |
1c3d2b94 | 360 | .L1st: |
1c3d2b94 AP |
361 | add $ap,$j,%o4 |
362 | add $np,$j,%o5 | |
363 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | |
364 | fzeros $alo | |
365 | ld [%o4+4],$ahi_ | |
366 | fzeros $ahi | |
367 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | |
368 | fzeros $nlo | |
369 | ld [%o5+4],$nhi_ | |
370 | fzeros $nhi | |
371 | ||
372 | fxtod $alo,$alo | |
373 | fxtod $ahi,$ahi | |
374 | fxtod $nlo,$nlo | |
375 | fxtod $nhi,$nhi | |
376 | ||
2e21922e | 377 | ldx [%sp+$bias+$frame+0],%o0 |
1c3d2b94 | 378 | fmuld $alo,$ba,$aloa |
2e21922e | 379 | ldx [%sp+$bias+$frame+8],%o1 |
1c3d2b94 | 380 | fmuld $nlo,$na,$nloa |
2e21922e | 381 | ldx [%sp+$bias+$frame+16],%o2 |
1c3d2b94 | 382 | fmuld $alo,$bb,$alob |
2e21922e | 383 | ldx [%sp+$bias+$frame+24],%o3 |
1c3d2b94 | 384 | fmuld $nlo,$nb,$nlob |
2e21922e AP |
385 | |
386 | srlx %o0,16,%o7 | |
387 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
1c3d2b94 | 388 | fmuld $alo,$bc,$aloc |
2e21922e AP |
389 | add %o7,%o1,%o1 |
390 | std $ahi,[$ap_h+$j] | |
391 | faddd $aloa,$nloa,$nloa | |
1c3d2b94 | 392 | fmuld $nlo,$nc,$nloc |
2e21922e AP |
393 | srlx %o1,16,%o7 |
394 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
1c3d2b94 | 395 | fmuld $alo,$bd,$alod |
2e21922e AP |
396 | add %o7,%o2,%o2 |
397 | std $nhi,[$np_h+$j] | |
398 | faddd $alob,$nlob,$nlob | |
1c3d2b94 | 399 | fmuld $nlo,$nd,$nlod |
2e21922e | 400 | srlx %o2,16,%o7 |
1c3d2b94 | 401 | fmuld $ahi,$ba,$ahia |
2e21922e AP |
402 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
403 | and %o0,$mask,%o0 | |
404 | faddd $aloc,$nloc,$nloc | |
1c3d2b94 | 405 | fmuld $nhi,$na,$nhia |
2e21922e AP |
406 | and %o1,$mask,%o1 |
407 | and %o2,$mask,%o2 | |
1c3d2b94 | 408 | fmuld $ahi,$bb,$ahib |
2e21922e AP |
409 | sllx %o1,16,%o1 |
410 | faddd $alod,$nlod,$nlod | |
1c3d2b94 | 411 | fmuld $nhi,$nb,$nhib |
2e21922e | 412 | sllx %o2,32,%o2 |
1c3d2b94 | 413 | fmuld $ahi,$bc,$ahic |
2e21922e AP |
414 | sllx %o3,48,%o7 |
415 | or %o1,%o0,%o0 | |
416 | faddd $ahia,$nhia,$nhia | |
1c3d2b94 | 417 | fmuld $nhi,$nc,$nhic |
2e21922e | 418 | or %o2,%o0,%o0 |
1c3d2b94 | 419 | fmuld $ahi,$bd,$ahid |
2e21922e AP |
420 | or %o7,%o0,%o0 ! 64-bit result |
421 | faddd $ahib,$nhib,$nhib | |
1c3d2b94 | 422 | fmuld $nhi,$nd,$nhid |
2e21922e AP |
423 | addcc %g1,%o0,%o0 |
424 | faddd $dota,$nloa,$nloa | |
425 | srlx %o3,16,%g1 ! 34-bit carry | |
426 | faddd $dotb,$nlob,$nlob | |
427 | bcs,a %xcc,.+8 | |
428 | add %g1,1,%g1 | |
429 | ||
430 | stx %o0,[$tp] ! tp[j-1]= | |
1c3d2b94 | 431 | |
1c3d2b94 AP |
432 | faddd $ahic,$nhic,$dota ! $nhic |
433 | faddd $ahid,$nhid,$dotb ! $nhid | |
434 | ||
435 | faddd $nloc,$nhia,$nloc | |
436 | faddd $nlod,$nhib,$nlod | |
437 | ||
438 | fdtox $nloa,$nloa | |
439 | fdtox $nlob,$nlob | |
440 | fdtox $nloc,$nloc | |
441 | fdtox $nlod,$nlod | |
442 | ||
443 | std $nloa,[%sp+$bias+$frame+0] | |
444 | std $nlob,[%sp+$bias+$frame+8] | |
445 | std $nloc,[%sp+$bias+$frame+16] | |
446 | std $nlod,[%sp+$bias+$frame+24] | |
447 | ||
aa2be094 AP |
448 | addcc $j,8,$j |
449 | bnz,pt %icc,.L1st | |
bcb43bb3 | 450 | add $tp,8,$tp |
1c3d2b94 AP |
451 | \f |
452 | .L1stskip: | |
ebae8092 AP |
453 | fdtox $dota,$dota |
454 | fdtox $dotb,$dotb | |
455 | ||
1c3d2b94 AP |
456 | ldx [%sp+$bias+$frame+0],%o0 |
457 | ldx [%sp+$bias+$frame+8],%o1 | |
458 | ldx [%sp+$bias+$frame+16],%o2 | |
459 | ldx [%sp+$bias+$frame+24],%o3 | |
460 | ||
461 | srlx %o0,16,%o7 | |
ebae8092 | 462 | std $dota,[%sp+$bias+$frame+32] |
1c3d2b94 | 463 | add %o7,%o1,%o1 |
ebae8092 | 464 | std $dotb,[%sp+$bias+$frame+40] |
1c3d2b94 AP |
465 | srlx %o1,16,%o7 |
466 | add %o7,%o2,%o2 | |
467 | srlx %o2,16,%o7 | |
468 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
469 | and %o0,$mask,%o0 | |
470 | and %o1,$mask,%o1 | |
471 | and %o2,$mask,%o2 | |
472 | sllx %o1,16,%o1 | |
473 | sllx %o2,32,%o2 | |
474 | sllx %o3,48,%o7 | |
475 | or %o1,%o0,%o0 | |
476 | or %o2,%o0,%o0 | |
477 | or %o7,%o0,%o0 ! 64-bit result | |
ebae8092 | 478 | ldx [%sp+$bias+$frame+32],%o4 |
1c3d2b94 | 479 | addcc %g1,%o0,%o0 |
ebae8092 | 480 | ldx [%sp+$bias+$frame+40],%o5 |
1c3d2b94 AP |
481 | srlx %o3,16,%g1 ! 34-bit carry |
482 | bcs,a %xcc,.+8 | |
483 | add %g1,1,%g1 | |
484 | ||
485 | stx %o0,[$tp] ! tp[j-1]= | |
486 | add $tp,8,$tp | |
bcb43bb3 | 487 | |
ebae8092 AP |
488 | srlx %o4,16,%o7 |
489 | add %o7,%o5,%o5 | |
490 | and %o4,$mask,%o4 | |
491 | sllx %o5,16,%o7 | |
492 | or %o7,%o4,%o4 | |
493 | addcc %g1,%o4,%o4 | |
494 | srlx %o5,48,%g1 | |
bcb43bb3 AP |
495 | bcs,a %xcc,.+8 |
496 | add %g1,1,%g1 | |
497 | ||
498 | mov %g1,$carry | |
ebae8092 | 499 | stx %o4,[$tp] ! tp[num-1]= |
bcb43bb3 AP |
500 | \f |
501 | ba .Louter | |
502 | add $i,8,$i | |
503 | .align 32 | |
504 | .Louter: | |
6df8c74d | 505 | sub %g0,$num,$j ! j=-num |
bcb43bb3 AP |
506 | add %sp,$bias+$frame+$locals,$tp |
507 | ||
87d3af64 | 508 | add $ap,$j,%o3 |
bcb43bb3 | 509 | add $bp,$i,%o4 |
6df8c74d | 510 | |
87d3af64 AP |
511 | ld [%o3+4],%g1 ! bp[i] |
512 | ld [%o3+0],%o0 | |
513 | ld [%o4+4],%g5 ! ap[0] | |
514 | sllx %g1,32,%g1 | |
515 | ld [%o4+0],%o1 | |
516 | sllx %g5,32,%g5 | |
6df8c74d AP |
517 | or %g1,%o0,%o0 |
518 | or %g5,%o1,%o1 | |
519 | ||
bcb43bb3 AP |
520 | ldx [$tp],%o2 ! tp[0] |
521 | mulx %o1,%o0,%o0 | |
522 | addcc %o2,%o0,%o0 | |
523 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | |
aa2be094 | 524 | stx %o0,[%sp+$bias+$frame+0] |
bcb43bb3 | 525 | |
bcb43bb3 | 526 | ! transfer b[i] to FPU as 4x16-bit values |
6df8c74d AP |
527 | ldda [%o4+2]%asi,$ba |
528 | ldda [%o4+0]%asi,$bb | |
529 | ldda [%o4+6]%asi,$bc | |
530 | ldda [%o4+4]%asi,$bd | |
bcb43bb3 AP |
531 | |
532 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | |
aa2be094 | 533 | ldda [%sp+$bias+$frame+6]%asi,$na |
bcb43bb3 | 534 | fxtod $ba,$ba |
aa2be094 | 535 | ldda [%sp+$bias+$frame+4]%asi,$nb |
bcb43bb3 | 536 | fxtod $bb,$bb |
aa2be094 | 537 | ldda [%sp+$bias+$frame+2]%asi,$nc |
bcb43bb3 | 538 | fxtod $bc,$bc |
aa2be094 | 539 | ldda [%sp+$bias+$frame+0]%asi,$nd |
bcb43bb3 AP |
540 | fxtod $bd,$bd |
541 | ldd [$ap_l+$j],$alo ! load a[j] in double format | |
542 | fxtod $na,$na | |
543 | ldd [$ap_h+$j],$ahi | |
544 | fxtod $nb,$nb | |
545 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
546 | fxtod $nc,$nc | |
547 | ldd [$np_h+$j],$nhi | |
548 | fxtod $nd,$nd | |
549 | ||
aa2be094 AP |
550 | fmuld $alo,$ba,$aloa |
551 | fmuld $nlo,$na,$nloa | |
552 | fmuld $alo,$bb,$alob | |
553 | fmuld $nlo,$nb,$nlob | |
554 | fmuld $alo,$bc,$aloc | |
aa2be094 | 555 | faddd $aloa,$nloa,$nloa |
6df8c74d | 556 | fmuld $nlo,$nc,$nloc |
aa2be094 | 557 | fmuld $alo,$bd,$alod |
aa2be094 | 558 | faddd $alob,$nlob,$nlob |
6df8c74d | 559 | fmuld $nlo,$nd,$nlod |
aa2be094 | 560 | fmuld $ahi,$ba,$ahia |
aa2be094 | 561 | faddd $aloc,$nloc,$nloc |
6df8c74d | 562 | fmuld $nhi,$na,$nhia |
aa2be094 | 563 | fmuld $ahi,$bb,$ahib |
aa2be094 | 564 | faddd $alod,$nlod,$nlod |
6df8c74d | 565 | fmuld $nhi,$nb,$nhib |
aa2be094 | 566 | fmuld $ahi,$bc,$ahic |
aa2be094 | 567 | faddd $ahia,$nhia,$nhia |
6df8c74d | 568 | fmuld $nhi,$nc,$nhic |
aa2be094 | 569 | fmuld $ahi,$bd,$ahid |
6df8c74d | 570 | faddd $ahib,$nhib,$nhib |
aa2be094 | 571 | fmuld $nhi,$nd,$nhid |
bcb43bb3 | 572 | |
bcb43bb3 AP |
573 | faddd $ahic,$nhic,$dota ! $nhic |
574 | faddd $ahid,$nhid,$dotb ! $nhid | |
575 | ||
576 | faddd $nloc,$nhia,$nloc | |
577 | faddd $nlod,$nhib,$nlod | |
578 | ||
579 | fdtox $nloa,$nloa | |
580 | fdtox $nlob,$nlob | |
581 | fdtox $nloc,$nloc | |
582 | fdtox $nlod,$nlod | |
583 | ||
584 | std $nloa,[%sp+$bias+$frame+0] | |
585 | std $nlob,[%sp+$bias+$frame+8] | |
586 | std $nloc,[%sp+$bias+$frame+16] | |
2e21922e | 587 | add $j,8,$j |
bcb43bb3 | 588 | std $nlod,[%sp+$bias+$frame+24] |
2e21922e AP |
589 | \f |
590 | ldd [$ap_l+$j],$alo ! load a[j] in double format | |
591 | ldd [$ap_h+$j],$ahi | |
592 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
593 | ldd [$np_h+$j],$nhi | |
594 | ||
595 | fmuld $alo,$ba,$aloa | |
596 | fmuld $nlo,$na,$nloa | |
597 | fmuld $alo,$bb,$alob | |
598 | fmuld $nlo,$nb,$nlob | |
599 | fmuld $alo,$bc,$aloc | |
bcb43bb3 | 600 | ldx [%sp+$bias+$frame+0],%o0 |
2e21922e AP |
601 | faddd $aloa,$nloa,$nloa |
602 | fmuld $nlo,$nc,$nloc | |
bcb43bb3 | 603 | ldx [%sp+$bias+$frame+8],%o1 |
2e21922e | 604 | fmuld $alo,$bd,$alod |
bcb43bb3 | 605 | ldx [%sp+$bias+$frame+16],%o2 |
2e21922e AP |
606 | faddd $alob,$nlob,$nlob |
607 | fmuld $nlo,$nd,$nlod | |
bcb43bb3 | 608 | ldx [%sp+$bias+$frame+24],%o3 |
2e21922e | 609 | fmuld $ahi,$ba,$ahia |
bcb43bb3 AP |
610 | |
611 | srlx %o0,16,%o7 | |
2e21922e AP |
612 | faddd $aloc,$nloc,$nloc |
613 | fmuld $nhi,$na,$nhia | |
bcb43bb3 | 614 | add %o7,%o1,%o1 |
2e21922e | 615 | fmuld $ahi,$bb,$ahib |
bcb43bb3 | 616 | srlx %o1,16,%o7 |
2e21922e AP |
617 | faddd $alod,$nlod,$nlod |
618 | fmuld $nhi,$nb,$nhib | |
bcb43bb3 | 619 | add %o7,%o2,%o2 |
2e21922e | 620 | fmuld $ahi,$bc,$ahic |
bcb43bb3 | 621 | srlx %o2,16,%o7 |
2e21922e AP |
622 | faddd $ahia,$nhia,$nhia |
623 | fmuld $nhi,$nc,$nhic | |
bcb43bb3 AP |
624 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
625 | ! why? | |
626 | and %o0,$mask,%o0 | |
2e21922e | 627 | fmuld $ahi,$bd,$ahid |
bcb43bb3 AP |
628 | and %o1,$mask,%o1 |
629 | and %o2,$mask,%o2 | |
2e21922e AP |
630 | faddd $ahib,$nhib,$nhib |
631 | fmuld $nhi,$nd,$nhid | |
bcb43bb3 | 632 | sllx %o1,16,%o1 |
2e21922e | 633 | faddd $dota,$nloa,$nloa |
bcb43bb3 | 634 | sllx %o2,32,%o2 |
2e21922e | 635 | faddd $dotb,$nlob,$nlob |
bcb43bb3 AP |
636 | sllx %o3,48,%o7 |
637 | or %o1,%o0,%o0 | |
2e21922e | 638 | faddd $ahic,$nhic,$dota ! $nhic |
bcb43bb3 | 639 | or %o2,%o0,%o0 |
2e21922e | 640 | faddd $ahid,$nhid,$dotb ! $nhid |
bcb43bb3 AP |
641 | or %o7,%o0,%o0 ! 64-bit result |
642 | ldx [$tp],%o7 | |
2e21922e | 643 | faddd $nloc,$nhia,$nloc |
bcb43bb3 AP |
644 | addcc %o7,%o0,%o0 |
645 | ! end-of-why? | |
2e21922e | 646 | faddd $nlod,$nhib,$nlod |
bcb43bb3 | 647 | srlx %o3,16,%g1 ! 34-bit carry |
2e21922e | 648 | fdtox $nloa,$nloa |
bcb43bb3 AP |
649 | bcs,a %xcc,.+8 |
650 | add %g1,1,%g1 | |
bcb43bb3 | 651 | |
bcb43bb3 AP |
652 | fdtox $nlob,$nlob |
653 | fdtox $nloc,$nloc | |
654 | fdtox $nlod,$nlod | |
655 | ||
656 | std $nloa,[%sp+$bias+$frame+0] | |
657 | std $nlob,[%sp+$bias+$frame+8] | |
2e21922e | 658 | addcc $j,8,$j |
bcb43bb3 | 659 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 660 | bz,pn %icc,.Linnerskip |
bcb43bb3 | 661 | std $nlod,[%sp+$bias+$frame+24] |
1c3d2b94 | 662 | \f |
ebae8092 AP |
663 | ba .Linner |
664 | nop | |
665 | .align 32 | |
1c3d2b94 | 666 | .Linner: |
2e21922e AP |
667 | ldd [$ap_l+$j],$alo ! load a[j] in double format |
668 | ldd [$ap_h+$j],$ahi | |
669 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
670 | ldd [$np_h+$j],$nhi | |
671 | ||
672 | fmuld $alo,$ba,$aloa | |
673 | fmuld $nlo,$na,$nloa | |
674 | fmuld $alo,$bb,$alob | |
675 | fmuld $nlo,$nb,$nlob | |
676 | fmuld $alo,$bc,$aloc | |
bcb43bb3 | 677 | ldx [%sp+$bias+$frame+0],%o0 |
2e21922e AP |
678 | faddd $aloa,$nloa,$nloa |
679 | fmuld $nlo,$nc,$nloc | |
bcb43bb3 | 680 | ldx [%sp+$bias+$frame+8],%o1 |
2e21922e | 681 | fmuld $alo,$bd,$alod |
bcb43bb3 | 682 | ldx [%sp+$bias+$frame+16],%o2 |
2e21922e AP |
683 | faddd $alob,$nlob,$nlob |
684 | fmuld $nlo,$nd,$nlod | |
bcb43bb3 | 685 | ldx [%sp+$bias+$frame+24],%o3 |
2e21922e | 686 | fmuld $ahi,$ba,$ahia |
bcb43bb3 AP |
687 | |
688 | srlx %o0,16,%o7 | |
2e21922e AP |
689 | faddd $aloc,$nloc,$nloc |
690 | fmuld $nhi,$na,$nhia | |
bcb43bb3 | 691 | add %o7,%o1,%o1 |
2e21922e | 692 | fmuld $ahi,$bb,$ahib |
bcb43bb3 | 693 | srlx %o1,16,%o7 |
2e21922e AP |
694 | faddd $alod,$nlod,$nlod |
695 | fmuld $nhi,$nb,$nhib | |
bcb43bb3 | 696 | add %o7,%o2,%o2 |
2e21922e | 697 | fmuld $ahi,$bc,$ahic |
bcb43bb3 | 698 | srlx %o2,16,%o7 |
2e21922e AP |
699 | faddd $ahia,$nhia,$nhia |
700 | fmuld $nhi,$nc,$nhic | |
bcb43bb3 AP |
701 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
702 | and %o0,$mask,%o0 | |
2e21922e | 703 | fmuld $ahi,$bd,$ahid |
bcb43bb3 AP |
704 | and %o1,$mask,%o1 |
705 | and %o2,$mask,%o2 | |
2e21922e AP |
706 | faddd $ahib,$nhib,$nhib |
707 | fmuld $nhi,$nd,$nhid | |
bcb43bb3 | 708 | sllx %o1,16,%o1 |
2e21922e | 709 | faddd $dota,$nloa,$nloa |
bcb43bb3 | 710 | sllx %o2,32,%o2 |
2e21922e | 711 | faddd $dotb,$nlob,$nlob |
bcb43bb3 AP |
712 | sllx %o3,48,%o7 |
713 | or %o1,%o0,%o0 | |
2e21922e | 714 | faddd $ahic,$nhic,$dota ! $nhic |
bcb43bb3 | 715 | or %o2,%o0,%o0 |
2e21922e | 716 | faddd $ahid,$nhid,$dotb ! $nhid |
bcb43bb3 | 717 | or %o7,%o0,%o0 ! 64-bit result |
2e21922e | 718 | faddd $nloc,$nhia,$nloc |
bcb43bb3 | 719 | addcc %g1,%o0,%o0 |
ebae8092 | 720 | ldx [$tp+8],%o7 ! tp[j] |
2e21922e | 721 | faddd $nlod,$nhib,$nlod |
bcb43bb3 | 722 | srlx %o3,16,%g1 ! 34-bit carry |
2e21922e | 723 | fdtox $nloa,$nloa |
bcb43bb3 AP |
724 | bcs,a %xcc,.+8 |
725 | add %g1,1,%g1 | |
2e21922e | 726 | fdtox $nlob,$nlob |
bcb43bb3 | 727 | addcc %o7,%o0,%o0 |
2e21922e | 728 | fdtox $nloc,$nloc |
bcb43bb3 AP |
729 | bcs,a %xcc,.+8 |
730 | add %g1,1,%g1 | |
731 | ||
732 | stx %o0,[$tp] ! tp[j-1] | |
2e21922e | 733 | fdtox $nlod,$nlod |
1c3d2b94 AP |
734 | |
735 | std $nloa,[%sp+$bias+$frame+0] | |
736 | std $nlob,[%sp+$bias+$frame+8] | |
737 | std $nloc,[%sp+$bias+$frame+16] | |
aa2be094 | 738 | addcc $j,8,$j |
2e21922e | 739 | std $nlod,[%sp+$bias+$frame+24] |
aa2be094 | 740 | bnz,pt %icc,.Linner |
bcb43bb3 | 741 | add $tp,8,$tp |
1c3d2b94 AP |
742 | \f |
743 | .Linnerskip: | |
2e21922e AP |
744 | fdtox $dota,$dota |
745 | fdtox $dotb,$dotb | |
746 | ||
1c3d2b94 AP |
747 | ldx [%sp+$bias+$frame+0],%o0 |
748 | ldx [%sp+$bias+$frame+8],%o1 | |
749 | ldx [%sp+$bias+$frame+16],%o2 | |
750 | ldx [%sp+$bias+$frame+24],%o3 | |
751 | ||
752 | srlx %o0,16,%o7 | |
2e21922e | 753 | std $dota,[%sp+$bias+$frame+32] |
1c3d2b94 | 754 | add %o7,%o1,%o1 |
2e21922e | 755 | std $dotb,[%sp+$bias+$frame+40] |
1c3d2b94 AP |
756 | srlx %o1,16,%o7 |
757 | add %o7,%o2,%o2 | |
758 | srlx %o2,16,%o7 | |
759 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
760 | and %o0,$mask,%o0 | |
761 | and %o1,$mask,%o1 | |
762 | and %o2,$mask,%o2 | |
763 | sllx %o1,16,%o1 | |
764 | sllx %o2,32,%o2 | |
765 | sllx %o3,48,%o7 | |
766 | or %o1,%o0,%o0 | |
767 | or %o2,%o0,%o0 | |
2e21922e | 768 | ldx [%sp+$bias+$frame+32],%o4 |
1c3d2b94 | 769 | or %o7,%o0,%o0 ! 64-bit result |
2e21922e | 770 | ldx [%sp+$bias+$frame+40],%o5 |
1c3d2b94 | 771 | addcc %g1,%o0,%o0 |
2e21922e | 772 | ldx [$tp+8],%o7 ! tp[j] |
1c3d2b94 AP |
773 | srlx %o3,16,%g1 ! 34-bit carry |
774 | bcs,a %xcc,.+8 | |
775 | add %g1,1,%g1 | |
776 | ||
1c3d2b94 AP |
777 | addcc %o7,%o0,%o0 |
778 | bcs,a %xcc,.+8 | |
779 | add %g1,1,%g1 | |
780 | ||
781 | stx %o0,[$tp] ! tp[j-1] | |
782 | add $tp,8,$tp | |
bcb43bb3 | 783 | |
2e21922e AP |
784 | srlx %o4,16,%o7 |
785 | add %o7,%o5,%o5 | |
786 | and %o4,$mask,%o4 | |
787 | sllx %o5,16,%o7 | |
788 | or %o7,%o4,%o4 | |
789 | addcc %g1,%o4,%o4 | |
790 | srlx %o5,48,%g1 | |
bcb43bb3 AP |
791 | bcs,a %xcc,.+8 |
792 | add %g1,1,%g1 | |
793 | ||
2e21922e AP |
794 | addcc $carry,%o4,%o4 |
795 | stx %o4,[$tp] ! tp[num-1] | |
bcb43bb3 AP |
796 | mov %g1,$carry |
797 | bcs,a %xcc,.+8 | |
798 | add $carry,1,$carry | |
799 | ||
aa2be094 AP |
800 | addcc $i,8,$i |
801 | bnz %icc,.Louter | |
bcb43bb3 AP |
802 | nop |
803 | \f | |
7d9cf7c0 | 804 | add $tp,8,$tp ! adjust tp to point at the end |
7d9cf7c0 | 805 | orn %g0,%g0,%g4 |
7d9cf7c0 | 806 | sub %g0,$num,%o7 ! n=-num |
23296942 | 807 | ba .Lsub |
673c55a2 | 808 | subcc %g0,%g0,%g0 ! clear %icc.c |
23296942 AP |
809 | |
810 | .align 32 | |
bcb43bb3 | 811 | .Lsub: |
87d3af64 AP |
812 | ldx [$tp+%o7],%o0 |
813 | add $np,%o7,%g1 | |
814 | ld [%g1+0],%o2 | |
815 | ld [%g1+4],%o3 | |
816 | srlx %o0,32,%o1 | |
817 | subccc %o0,%o2,%o2 | |
818 | add $rp,%o7,%g1 | |
819 | subccc %o1,%o3,%o3 | |
820 | st %o2,[%g1+0] | |
aa2be094 AP |
821 | add %o7,8,%o7 |
822 | brnz,pt %o7,.Lsub | |
87d3af64 | 823 | st %o3,[%g1+4] |
7d9cf7c0 | 824 | subc $carry,0,%g4 |
6df8c74d | 825 | sub %g0,$num,%o7 ! n=-num |
23296942 AP |
826 | ba .Lcopy |
827 | nop | |
bcb43bb3 | 828 | |
23296942 | 829 | .align 32 |
bcb43bb3 | 830 | .Lcopy: |
aa2be094 | 831 | ldx [$tp+%o7],%o0 |
87d3af64 | 832 | add $rp,%o7,%g1 |
7d9cf7c0 AP |
833 | ld [%g1+0],%o2 |
834 | ld [%g1+4],%o3 | |
835 | stx %g0,[$tp+%o7] | |
836 | and %o0,%g4,%o0 | |
837 | srlx %o0,32,%o1 | |
838 | andn %o2,%g4,%o2 | |
839 | andn %o3,%g4,%o3 | |
840 | or %o2,%o0,%o0 | |
841 | or %o3,%o1,%o1 | |
87d3af64 | 842 | st %o0,[%g1+0] |
aa2be094 AP |
843 | add %o7,8,%o7 |
844 | brnz,pt %o7,.Lcopy | |
87d3af64 | 845 | st %o1,[%g1+4] |
6df8c74d | 846 | sub %g0,$num,%o7 ! n=-num |
bcb43bb3 | 847 | |
bcb43bb3 | 848 | .Lzap: |
aa2be094 AP |
849 | stx %g0,[$ap_l+%o7] |
850 | stx %g0,[$ap_h+%o7] | |
851 | stx %g0,[$np_l+%o7] | |
852 | stx %g0,[$np_h+%o7] | |
853 | add %o7,8,%o7 | |
854 | brnz,pt %o7,.Lzap | |
bcb43bb3 AP |
855 | nop |
856 | ||
857 | ldx [%sp+$bias+$frame+48],%o7 | |
858 | wr %g0,%o7,%asi ! restore %asi | |
859 | ||
860 | mov 1,%i0 | |
aa2be094 | 861 | .Lret: |
bcb43bb3 AP |
862 | ret |
863 | restore | |
864 | .type $fname,#function | |
865 | .size $fname,(.-$fname) | |
87d3af64 | 866 | .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" |
23296942 | 867 | .align 32 |
bcb43bb3 AP |
868 | ___ |
869 | ||
870 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | |
3b4a0225 AP |
871 | |
872 | # Below substitution makes it possible to compile without demanding | |
478b50cf | 873 | # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I |
3b4a0225 AP |
874 | # dare to do this, because VIS capability is detected at run-time now |
875 | # and this routine is not called on a CPU not capable of executing it. Do | |
876 | # note that fzeros is not the only VIS dependency! Another dependency | |
877 | # is implicit and is just _a_ numerical value loaded to %asi register, | |
878 | # which assembler can't recognize as VIS specific... | |
879 | $code =~ s/fzeros\s+%f([0-9]+)/ | |
880 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | |
881 | /gem; | |
882 | ||
bcb43bb3 | 883 | print $code; |
3b4a0225 | 884 | # flush; buffered write errors only surface at close, so the result is checked |
bcb43bb3 | 885 | close STDOUT or die "error closing STDOUT: $!"; |