]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/sparcv9a-mont.pl
Add OpenSSL copyright to .pl files
[thirdparty/openssl.git] / crypto / bn / asm / sparcv9a-mont.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
bcb43bb3
AP
9
10# ====================================================================
11# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
2e21922e
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
bcb43bb3
AP
15# ====================================================================
16
aa2be094
AP
17# October 2005
18#
bcb43bb3
AP
19# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20# Because unlike integer multiplier, which simply stalls whole CPU,
21# FPU is fully pipelined and can effectively emit 48 bit partial
22# product every cycle. Why not blended SPARC v9? One can argue that
23# making this module dependent on UltraSPARC VIS extension limits its
a4d729f3
AP
24# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25# implementations from compatibility matrix. But the rest, whole Sun
26# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27# VIS extension instructions used in this module. This is considered
73b979e6
AP
28# good enough to not care about HAL SPARC64 users [if any] who have
29# integer-only pure SPARCv9 module to "fall down" to.
bcb43bb3
AP
30
31# USI&II cores currently exhibit uniform 2x improvement [over pre-
32# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
33# performance improves by a few percent for shorter keys and worsens by a
aa2be094 34# few percent for longer keys. This is because USIII integer multiplier
bcb43bb3
AP
35# is >3x faster than USI&II one, which is harder to match [but see
36# TODO list below]. It should also be noted that SPARC64 V features
37# out-of-order execution, which *might* mean that integer multiplier
a4d729f3
AP
38# is pipelined, which in turn *might* be impossible to match... On an
39# additional note, SPARC64 V implements an FP Multiply-Add instruction,
40# which is perfectly usable in this context... In other words, as far
73b979e6 41# as Fujitsu SPARC64 V goes, talk to the author:-)
aa2be094 42
a00e414f
AP
43# The implementation imposes the following "non-natural" limitations on
44# input arguments:
aa2be094
AP
45# - num may not be less than 4;
46# - num has to be even;
aa2be094
AP
47# Failure to meet either condition has no fatal effects; it simply
48# doesn't give any performance gain.
49
bcb43bb3 50# TODO:
bcb43bb3
AP
51# - modulo-schedule inner loop for better performance (on in-order
52# execution core such as UltraSPARC this shall result in further
53# noticeable(!) improvement);
54# - dedicated squaring procedure[?];
55
2e21922e
AP
56######################################################################
57# November 2006
58#
59# Modulo-scheduled inner loops make it possible to interleave floating point
60# and integer instructions and minimize Read-After-Write penalties. This
61# results in a *further* 20-50% performance improvement [depending on
62# key length, more for longer keys] on USI&II cores and 30-80% - on
63# USIII&IV.
64
6bd7a4d9
RL
65$output = pop;
66open STDOUT,">$output";
67
a00e414f 68$fname="bn_mul_mont_fpu";
eb77e888
AP
69
70$frame="STACK_FRAME";
71$bias="STACK_BIAS";
bcb43bb3
AP
72$locals=64;
73
74# In order to provide for 32-/64-bit ABI duality, I keep integers wider
75# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
76# exclusively for pointers, indexes and other small values...
77# int bn_mul_mont(
78$rp="%i0"; # BN_ULONG *rp,
79$ap="%i1"; # const BN_ULONG *ap,
80$bp="%i2"; # const BN_ULONG *bp,
81$np="%i3"; # const BN_ULONG *np,
4d524040 82$n0="%i4"; # const BN_ULONG *n0,
bcb43bb3
AP
83$num="%i5"; # int num);
84
aa2be094 85$tp="%l0"; # t[num]
bcb43bb3
AP
86$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
87$ap_h="%l2"; # to these four vectors as double-precision FP values.
88$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
89$np_h="%l4"; # loop and L1-cache aliasing is minimized...
90$i="%l5";
91$j="%l6";
92$mask="%l7"; # 16-bit mask, 0xffff
93
aa2be094
AP
94$n0="%g4"; # reassigned(!) to "64-bit" register
95$carry="%i4"; # %i4 reused(!) for a carry bit
bcb43bb3
AP
96
97# FP register naming chart
98#
99# ..HILO
100# dcba
101# --------
102# LOa
103# LOb
104# LOc
105# LOd
106# HIa
107# HIb
108# HIc
109# HId
110# ..a
111# ..b
112$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
113$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
114$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
115$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
116
117$dota="%f24"; $dotb="%f26";
118
119$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
120$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
121$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
122$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
123
124$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
125
126$code=<<___;
eb77e888
AP
127#include "sparc_arch.h"
128
bcb43bb3
AP
129.section ".text",#alloc,#execinstr
130
131.global $fname
132.align 32
133$fname:
aa2be094 134 save %sp,-$frame-$locals,%sp
6df8c74d 135
aa2be094
AP
136 cmp $num,4
137 bl,a,pn %icc,.Lret
138 clr %i0
139 andcc $num,1,%g0 ! $num has to be even...
140 bnz,a,pn %icc,.Lret
141 clr %i0 ! signal "unsupported input value"
760e3535 142
aa2be094 143 srl $num,1,$num
760e3535 144 sethi %hi(0xffff),$mask
aa2be094 145 ld [%i4+0],$n0 ! $n0 reassigned, remember?
760e3535 146 or $mask,%lo(0xffff),$mask
aa2be094
AP
147 ld [%i4+4],%o0
148 sllx %o0,32,%o0
149 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
6df8c74d 150
aa2be094 151 sll $num,3,$num ! num*=8
bcb43bb3
AP
152
153 add %sp,$bias,%o0 ! real top of stack
154 sll $num,2,%o1
155 add %o1,$num,%o1 ! %o1=num*5
156 sub %o0,%o1,%o0
bcb43bb3 157 and %o0,-2048,%o0 ! optimize TLB utilization
aa2be094 158 sub %o0,$bias,%sp ! alloca(5*num*8)
bcb43bb3 159
aa2be094 160 rd %asi,%o7 ! save %asi
bcb43bb3
AP
161 add %sp,$bias+$frame+$locals,$tp
162 add $tp,$num,$ap_l
aa2be094 163 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
bcb43bb3
AP
164 add $ap_l,$num,$ap_h
165 add $ap_h,$num,$np_l
166 add $np_l,$num,$np_h
167
168 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
169
170 add $rp,$num,$rp ! readjust input pointers to point
171 add $ap,$num,$ap ! at the ends too...
172 add $bp,$num,$bp
173 add $np,$num,$np
174
aa2be094 175 stx %o7,[%sp+$bias+$frame+48] ! save %asi
bcb43bb3 176\f
6df8c74d
AP
177 sub %g0,$num,$i ! i=-num
178 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
179
180 add $ap,$j,%o3
181 add $bp,$i,%o4
6df8c74d 182
87d3af64
AP
183 ld [%o3+4],%g1 ! ap[0]
184 ld [%o3+0],%o0
185 ld [%o4+4],%g5 ! bp[0]
186 sllx %g1,32,%g1
187 ld [%o4+0],%o1
188 sllx %g5,32,%g5
6df8c74d
AP
189 or %g1,%o0,%o0
190 or %g5,%o1,%o1
191
aa2be094 192 add $np,$j,%o5
bcb43bb3
AP
193
194 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
195 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
aa2be094 196 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 197
6df8c74d 198 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 199 fzeros $alo
6df8c74d 200 ld [%o3+4],$ahi_
aa2be094 201 fzeros $ahi
6df8c74d 202 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 203 fzeros $nlo
6df8c74d 204 ld [%o5+4],$nhi_
aa2be094 205 fzeros $nhi
bcb43bb3
AP
206
207 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d 208 ldda [%o4+2]%asi,$ba
bcb43bb3 209 fxtod $alo,$alo
6df8c74d 210 ldda [%o4+0]%asi,$bb
bcb43bb3 211 fxtod $ahi,$ahi
6df8c74d 212 ldda [%o4+6]%asi,$bc
bcb43bb3 213 fxtod $nlo,$nlo
6df8c74d 214 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
215 fxtod $nhi,$nhi
216
217 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
aa2be094 218 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 219 fxtod $ba,$ba
aa2be094 220 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 221 fxtod $bb,$bb
aa2be094 222 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 223 fxtod $bc,$bc
aa2be094 224 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
225 fxtod $bd,$bd
226
227 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
228 fxtod $na,$na
229 std $ahi,[$ap_h+$j]
230 fxtod $nb,$nb
231 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
232 fxtod $nc,$nc
233 std $nhi,[$np_h+$j]
234 fxtod $nd,$nd
235
aa2be094
AP
236 fmuld $alo,$ba,$aloa
237 fmuld $nlo,$na,$nloa
238 fmuld $alo,$bb,$alob
239 fmuld $nlo,$nb,$nlob
240 fmuld $alo,$bc,$aloc
aa2be094 241 faddd $aloa,$nloa,$nloa
6df8c74d 242 fmuld $nlo,$nc,$nloc
aa2be094 243 fmuld $alo,$bd,$alod
aa2be094 244 faddd $alob,$nlob,$nlob
6df8c74d 245 fmuld $nlo,$nd,$nlod
aa2be094 246 fmuld $ahi,$ba,$ahia
aa2be094 247 faddd $aloc,$nloc,$nloc
6df8c74d 248 fmuld $nhi,$na,$nhia
aa2be094 249 fmuld $ahi,$bb,$ahib
aa2be094 250 faddd $alod,$nlod,$nlod
6df8c74d 251 fmuld $nhi,$nb,$nhib
aa2be094 252 fmuld $ahi,$bc,$ahic
aa2be094 253 faddd $ahia,$nhia,$nhia
6df8c74d 254 fmuld $nhi,$nc,$nhic
aa2be094 255 fmuld $ahi,$bd,$ahid
6df8c74d 256 faddd $ahib,$nhib,$nhib
aa2be094 257 fmuld $nhi,$nd,$nhid
bcb43bb3 258
bcb43bb3
AP
259 faddd $ahic,$nhic,$dota ! $nhic
260 faddd $ahid,$nhid,$dotb ! $nhid
261
262 faddd $nloc,$nhia,$nloc
263 faddd $nlod,$nhib,$nlod
264
265 fdtox $nloa,$nloa
266 fdtox $nlob,$nlob
267 fdtox $nloc,$nloc
268 fdtox $nlod,$nlod
269
270 std $nloa,[%sp+$bias+$frame+0]
2e21922e 271 add $j,8,$j
bcb43bb3 272 std $nlob,[%sp+$bias+$frame+8]
2e21922e 273 add $ap,$j,%o4
bcb43bb3 274 std $nloc,[%sp+$bias+$frame+16]
2e21922e 275 add $np,$j,%o5
bcb43bb3 276 std $nlod,[%sp+$bias+$frame+24]
bcb43bb3 277\f
1c3d2b94 278 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 279 fzeros $alo
1c3d2b94 280 ld [%o4+4],$ahi_
aa2be094 281 fzeros $ahi
1c3d2b94 282 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 283 fzeros $nlo
1c3d2b94 284 ld [%o5+4],$nhi_
aa2be094 285 fzeros $nhi
bcb43bb3
AP
286
287 fxtod $alo,$alo
288 fxtod $ahi,$ahi
289 fxtod $nlo,$nlo
290 fxtod $nhi,$nhi
291
2e21922e 292 ldx [%sp+$bias+$frame+0],%o0
aa2be094 293 fmuld $alo,$ba,$aloa
2e21922e 294 ldx [%sp+$bias+$frame+8],%o1
aa2be094 295 fmuld $nlo,$na,$nloa
2e21922e 296 ldx [%sp+$bias+$frame+16],%o2
aa2be094 297 fmuld $alo,$bb,$alob
2e21922e 298 ldx [%sp+$bias+$frame+24],%o3
aa2be094 299 fmuld $nlo,$nb,$nlob
2e21922e
AP
300
301 srlx %o0,16,%o7
302 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
aa2be094 303 fmuld $alo,$bc,$aloc
2e21922e
AP
304 add %o7,%o1,%o1
305 std $ahi,[$ap_h+$j]
306 faddd $aloa,$nloa,$nloa
6df8c74d 307 fmuld $nlo,$nc,$nloc
2e21922e
AP
308 srlx %o1,16,%o7
309 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
aa2be094 310 fmuld $alo,$bd,$alod
2e21922e
AP
311 add %o7,%o2,%o2
312 std $nhi,[$np_h+$j]
313 faddd $alob,$nlob,$nlob
6df8c74d 314 fmuld $nlo,$nd,$nlod
2e21922e 315 srlx %o2,16,%o7
aa2be094 316 fmuld $ahi,$ba,$ahia
2e21922e
AP
317 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
318 faddd $aloc,$nloc,$nloc
6df8c74d 319 fmuld $nhi,$na,$nhia
2e21922e
AP
320 !and %o0,$mask,%o0
321 !and %o1,$mask,%o1
322 !and %o2,$mask,%o2
323 !sllx %o1,16,%o1
324 !sllx %o2,32,%o2
325 !sllx %o3,48,%o7
326 !or %o1,%o0,%o0
327 !or %o2,%o0,%o0
328 !or %o7,%o0,%o0 ! 64-bit result
329 srlx %o3,16,%g1 ! 34-bit carry
aa2be094 330 fmuld $ahi,$bb,$ahib
2e21922e 331
aa2be094 332 faddd $alod,$nlod,$nlod
6df8c74d 333 fmuld $nhi,$nb,$nhib
aa2be094 334 fmuld $ahi,$bc,$ahic
aa2be094 335 faddd $ahia,$nhia,$nhia
6df8c74d 336 fmuld $nhi,$nc,$nhic
aa2be094 337 fmuld $ahi,$bd,$ahid
aa2be094 338 faddd $ahib,$nhib,$nhib
6df8c74d 339 fmuld $nhi,$nd,$nhid
bcb43bb3
AP
340
341 faddd $dota,$nloa,$nloa
342 faddd $dotb,$nlob,$nlob
343 faddd $ahic,$nhic,$dota ! $nhic
344 faddd $ahid,$nhid,$dotb ! $nhid
345
346 faddd $nloc,$nhia,$nloc
347 faddd $nlod,$nhib,$nlod
348
349 fdtox $nloa,$nloa
350 fdtox $nlob,$nlob
351 fdtox $nloc,$nloc
352 fdtox $nlod,$nlod
353
354 std $nloa,[%sp+$bias+$frame+0]
355 std $nlob,[%sp+$bias+$frame+8]
2e21922e 356 addcc $j,8,$j
bcb43bb3 357 std $nloc,[%sp+$bias+$frame+16]
2e21922e 358 bz,pn %icc,.L1stskip
bcb43bb3 359 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 360\f
23296942 361.align 32 ! incidentally already aligned !
1c3d2b94 362.L1st:
1c3d2b94
AP
363 add $ap,$j,%o4
364 add $np,$j,%o5
365 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
366 fzeros $alo
367 ld [%o4+4],$ahi_
368 fzeros $ahi
369 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
370 fzeros $nlo
371 ld [%o5+4],$nhi_
372 fzeros $nhi
373
374 fxtod $alo,$alo
375 fxtod $ahi,$ahi
376 fxtod $nlo,$nlo
377 fxtod $nhi,$nhi
378
2e21922e 379 ldx [%sp+$bias+$frame+0],%o0
1c3d2b94 380 fmuld $alo,$ba,$aloa
2e21922e 381 ldx [%sp+$bias+$frame+8],%o1
1c3d2b94 382 fmuld $nlo,$na,$nloa
2e21922e 383 ldx [%sp+$bias+$frame+16],%o2
1c3d2b94 384 fmuld $alo,$bb,$alob
2e21922e 385 ldx [%sp+$bias+$frame+24],%o3
1c3d2b94 386 fmuld $nlo,$nb,$nlob
2e21922e
AP
387
388 srlx %o0,16,%o7
389 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
1c3d2b94 390 fmuld $alo,$bc,$aloc
2e21922e
AP
391 add %o7,%o1,%o1
392 std $ahi,[$ap_h+$j]
393 faddd $aloa,$nloa,$nloa
1c3d2b94 394 fmuld $nlo,$nc,$nloc
2e21922e
AP
395 srlx %o1,16,%o7
396 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
1c3d2b94 397 fmuld $alo,$bd,$alod
2e21922e
AP
398 add %o7,%o2,%o2
399 std $nhi,[$np_h+$j]
400 faddd $alob,$nlob,$nlob
1c3d2b94 401 fmuld $nlo,$nd,$nlod
2e21922e 402 srlx %o2,16,%o7
1c3d2b94 403 fmuld $ahi,$ba,$ahia
2e21922e
AP
404 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
405 and %o0,$mask,%o0
406 faddd $aloc,$nloc,$nloc
1c3d2b94 407 fmuld $nhi,$na,$nhia
2e21922e
AP
408 and %o1,$mask,%o1
409 and %o2,$mask,%o2
1c3d2b94 410 fmuld $ahi,$bb,$ahib
2e21922e
AP
411 sllx %o1,16,%o1
412 faddd $alod,$nlod,$nlod
1c3d2b94 413 fmuld $nhi,$nb,$nhib
2e21922e 414 sllx %o2,32,%o2
1c3d2b94 415 fmuld $ahi,$bc,$ahic
2e21922e
AP
416 sllx %o3,48,%o7
417 or %o1,%o0,%o0
418 faddd $ahia,$nhia,$nhia
1c3d2b94 419 fmuld $nhi,$nc,$nhic
2e21922e 420 or %o2,%o0,%o0
1c3d2b94 421 fmuld $ahi,$bd,$ahid
2e21922e
AP
422 or %o7,%o0,%o0 ! 64-bit result
423 faddd $ahib,$nhib,$nhib
1c3d2b94 424 fmuld $nhi,$nd,$nhid
2e21922e
AP
425 addcc %g1,%o0,%o0
426 faddd $dota,$nloa,$nloa
427 srlx %o3,16,%g1 ! 34-bit carry
428 faddd $dotb,$nlob,$nlob
429 bcs,a %xcc,.+8
430 add %g1,1,%g1
431
432 stx %o0,[$tp] ! tp[j-1]=
1c3d2b94 433
1c3d2b94
AP
434 faddd $ahic,$nhic,$dota ! $nhic
435 faddd $ahid,$nhid,$dotb ! $nhid
436
437 faddd $nloc,$nhia,$nloc
438 faddd $nlod,$nhib,$nlod
439
440 fdtox $nloa,$nloa
441 fdtox $nlob,$nlob
442 fdtox $nloc,$nloc
443 fdtox $nlod,$nlod
444
445 std $nloa,[%sp+$bias+$frame+0]
446 std $nlob,[%sp+$bias+$frame+8]
447 std $nloc,[%sp+$bias+$frame+16]
448 std $nlod,[%sp+$bias+$frame+24]
449
aa2be094
AP
450 addcc $j,8,$j
451 bnz,pt %icc,.L1st
bcb43bb3 452 add $tp,8,$tp
1c3d2b94
AP
453\f
454.L1stskip:
ebae8092
AP
455 fdtox $dota,$dota
456 fdtox $dotb,$dotb
457
1c3d2b94
AP
458 ldx [%sp+$bias+$frame+0],%o0
459 ldx [%sp+$bias+$frame+8],%o1
460 ldx [%sp+$bias+$frame+16],%o2
461 ldx [%sp+$bias+$frame+24],%o3
462
463 srlx %o0,16,%o7
ebae8092 464 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 465 add %o7,%o1,%o1
ebae8092 466 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
467 srlx %o1,16,%o7
468 add %o7,%o2,%o2
469 srlx %o2,16,%o7
470 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
471 and %o0,$mask,%o0
472 and %o1,$mask,%o1
473 and %o2,$mask,%o2
474 sllx %o1,16,%o1
475 sllx %o2,32,%o2
476 sllx %o3,48,%o7
477 or %o1,%o0,%o0
478 or %o2,%o0,%o0
479 or %o7,%o0,%o0 ! 64-bit result
ebae8092 480 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 481 addcc %g1,%o0,%o0
ebae8092 482 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94
AP
483 srlx %o3,16,%g1 ! 34-bit carry
484 bcs,a %xcc,.+8
485 add %g1,1,%g1
486
487 stx %o0,[$tp] ! tp[j-1]=
488 add $tp,8,$tp
bcb43bb3 489
ebae8092
AP
490 srlx %o4,16,%o7
491 add %o7,%o5,%o5
492 and %o4,$mask,%o4
493 sllx %o5,16,%o7
494 or %o7,%o4,%o4
495 addcc %g1,%o4,%o4
496 srlx %o5,48,%g1
bcb43bb3
AP
497 bcs,a %xcc,.+8
498 add %g1,1,%g1
499
500 mov %g1,$carry
ebae8092 501 stx %o4,[$tp] ! tp[num-1]=
bcb43bb3
AP
502\f
503 ba .Louter
504 add $i,8,$i
505.align 32
506.Louter:
6df8c74d 507 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
508 add %sp,$bias+$frame+$locals,$tp
509
87d3af64 510 add $ap,$j,%o3
bcb43bb3 511 add $bp,$i,%o4
6df8c74d 512
87d3af64
AP
513 ld [%o3+4],%g1 ! ap[0]
514 ld [%o3+0],%o0
515 ld [%o4+4],%g5 ! bp[i]
516 sllx %g1,32,%g1
517 ld [%o4+0],%o1
518 sllx %g5,32,%g5
6df8c74d
AP
519 or %g1,%o0,%o0
520 or %g5,%o1,%o1
521
bcb43bb3
AP
522 ldx [$tp],%o2 ! tp[0]
523 mulx %o1,%o0,%o0
524 addcc %o2,%o0,%o0
525 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
aa2be094 526 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 527
bcb43bb3 528 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d
AP
529 ldda [%o4+2]%asi,$ba
530 ldda [%o4+0]%asi,$bb
531 ldda [%o4+6]%asi,$bc
532 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
533
534 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
aa2be094 535 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 536 fxtod $ba,$ba
aa2be094 537 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 538 fxtod $bb,$bb
aa2be094 539 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 540 fxtod $bc,$bc
aa2be094 541 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
542 fxtod $bd,$bd
543 ldd [$ap_l+$j],$alo ! load a[j] in double format
544 fxtod $na,$na
545 ldd [$ap_h+$j],$ahi
546 fxtod $nb,$nb
547 ldd [$np_l+$j],$nlo ! load n[j] in double format
548 fxtod $nc,$nc
549 ldd [$np_h+$j],$nhi
550 fxtod $nd,$nd
551
aa2be094
AP
552 fmuld $alo,$ba,$aloa
553 fmuld $nlo,$na,$nloa
554 fmuld $alo,$bb,$alob
555 fmuld $nlo,$nb,$nlob
556 fmuld $alo,$bc,$aloc
aa2be094 557 faddd $aloa,$nloa,$nloa
6df8c74d 558 fmuld $nlo,$nc,$nloc
aa2be094 559 fmuld $alo,$bd,$alod
aa2be094 560 faddd $alob,$nlob,$nlob
6df8c74d 561 fmuld $nlo,$nd,$nlod
aa2be094 562 fmuld $ahi,$ba,$ahia
aa2be094 563 faddd $aloc,$nloc,$nloc
6df8c74d 564 fmuld $nhi,$na,$nhia
aa2be094 565 fmuld $ahi,$bb,$ahib
aa2be094 566 faddd $alod,$nlod,$nlod
6df8c74d 567 fmuld $nhi,$nb,$nhib
aa2be094 568 fmuld $ahi,$bc,$ahic
aa2be094 569 faddd $ahia,$nhia,$nhia
6df8c74d 570 fmuld $nhi,$nc,$nhic
aa2be094 571 fmuld $ahi,$bd,$ahid
6df8c74d 572 faddd $ahib,$nhib,$nhib
aa2be094 573 fmuld $nhi,$nd,$nhid
bcb43bb3 574
bcb43bb3
AP
575 faddd $ahic,$nhic,$dota ! $nhic
576 faddd $ahid,$nhid,$dotb ! $nhid
577
578 faddd $nloc,$nhia,$nloc
579 faddd $nlod,$nhib,$nlod
580
581 fdtox $nloa,$nloa
582 fdtox $nlob,$nlob
583 fdtox $nloc,$nloc
584 fdtox $nlod,$nlod
585
586 std $nloa,[%sp+$bias+$frame+0]
587 std $nlob,[%sp+$bias+$frame+8]
588 std $nloc,[%sp+$bias+$frame+16]
2e21922e 589 add $j,8,$j
bcb43bb3 590 std $nlod,[%sp+$bias+$frame+24]
2e21922e
AP
591\f
592 ldd [$ap_l+$j],$alo ! load a[j] in double format
593 ldd [$ap_h+$j],$ahi
594 ldd [$np_l+$j],$nlo ! load n[j] in double format
595 ldd [$np_h+$j],$nhi
596
597 fmuld $alo,$ba,$aloa
598 fmuld $nlo,$na,$nloa
599 fmuld $alo,$bb,$alob
600 fmuld $nlo,$nb,$nlob
601 fmuld $alo,$bc,$aloc
bcb43bb3 602 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
603 faddd $aloa,$nloa,$nloa
604 fmuld $nlo,$nc,$nloc
bcb43bb3 605 ldx [%sp+$bias+$frame+8],%o1
2e21922e 606 fmuld $alo,$bd,$alod
bcb43bb3 607 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
608 faddd $alob,$nlob,$nlob
609 fmuld $nlo,$nd,$nlod
bcb43bb3 610 ldx [%sp+$bias+$frame+24],%o3
2e21922e 611 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
612
613 srlx %o0,16,%o7
2e21922e
AP
614 faddd $aloc,$nloc,$nloc
615 fmuld $nhi,$na,$nhia
bcb43bb3 616 add %o7,%o1,%o1
2e21922e 617 fmuld $ahi,$bb,$ahib
bcb43bb3 618 srlx %o1,16,%o7
2e21922e
AP
619 faddd $alod,$nlod,$nlod
620 fmuld $nhi,$nb,$nhib
bcb43bb3 621 add %o7,%o2,%o2
2e21922e 622 fmuld $ahi,$bc,$ahic
bcb43bb3 623 srlx %o2,16,%o7
2e21922e
AP
624 faddd $ahia,$nhia,$nhia
625 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
626 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
627 ! why?
628 and %o0,$mask,%o0
2e21922e 629 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
630 and %o1,$mask,%o1
631 and %o2,$mask,%o2
2e21922e
AP
632 faddd $ahib,$nhib,$nhib
633 fmuld $nhi,$nd,$nhid
bcb43bb3 634 sllx %o1,16,%o1
2e21922e 635 faddd $dota,$nloa,$nloa
bcb43bb3 636 sllx %o2,32,%o2
2e21922e 637 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
638 sllx %o3,48,%o7
639 or %o1,%o0,%o0
2e21922e 640 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 641 or %o2,%o0,%o0
2e21922e 642 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3
AP
643 or %o7,%o0,%o0 ! 64-bit result
644 ldx [$tp],%o7
2e21922e 645 faddd $nloc,$nhia,$nloc
bcb43bb3
AP
646 addcc %o7,%o0,%o0
647 ! end-of-why?
2e21922e 648 faddd $nlod,$nhib,$nlod
bcb43bb3 649 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 650 fdtox $nloa,$nloa
bcb43bb3
AP
651 bcs,a %xcc,.+8
652 add %g1,1,%g1
bcb43bb3 653
bcb43bb3
AP
654 fdtox $nlob,$nlob
655 fdtox $nloc,$nloc
656 fdtox $nlod,$nlod
657
658 std $nloa,[%sp+$bias+$frame+0]
659 std $nlob,[%sp+$bias+$frame+8]
2e21922e 660 addcc $j,8,$j
bcb43bb3 661 std $nloc,[%sp+$bias+$frame+16]
2e21922e 662 bz,pn %icc,.Linnerskip
bcb43bb3 663 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 664\f
ebae8092
AP
665 ba .Linner
666 nop
667.align 32
1c3d2b94 668.Linner:
2e21922e
AP
669 ldd [$ap_l+$j],$alo ! load a[j] in double format
670 ldd [$ap_h+$j],$ahi
671 ldd [$np_l+$j],$nlo ! load n[j] in double format
672 ldd [$np_h+$j],$nhi
673
674 fmuld $alo,$ba,$aloa
675 fmuld $nlo,$na,$nloa
676 fmuld $alo,$bb,$alob
677 fmuld $nlo,$nb,$nlob
678 fmuld $alo,$bc,$aloc
bcb43bb3 679 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
680 faddd $aloa,$nloa,$nloa
681 fmuld $nlo,$nc,$nloc
bcb43bb3 682 ldx [%sp+$bias+$frame+8],%o1
2e21922e 683 fmuld $alo,$bd,$alod
bcb43bb3 684 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
685 faddd $alob,$nlob,$nlob
686 fmuld $nlo,$nd,$nlod
bcb43bb3 687 ldx [%sp+$bias+$frame+24],%o3
2e21922e 688 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
689
690 srlx %o0,16,%o7
2e21922e
AP
691 faddd $aloc,$nloc,$nloc
692 fmuld $nhi,$na,$nhia
bcb43bb3 693 add %o7,%o1,%o1
2e21922e 694 fmuld $ahi,$bb,$ahib
bcb43bb3 695 srlx %o1,16,%o7
2e21922e
AP
696 faddd $alod,$nlod,$nlod
697 fmuld $nhi,$nb,$nhib
bcb43bb3 698 add %o7,%o2,%o2
2e21922e 699 fmuld $ahi,$bc,$ahic
bcb43bb3 700 srlx %o2,16,%o7
2e21922e
AP
701 faddd $ahia,$nhia,$nhia
702 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
703 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
704 and %o0,$mask,%o0
2e21922e 705 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
706 and %o1,$mask,%o1
707 and %o2,$mask,%o2
2e21922e
AP
708 faddd $ahib,$nhib,$nhib
709 fmuld $nhi,$nd,$nhid
bcb43bb3 710 sllx %o1,16,%o1
2e21922e 711 faddd $dota,$nloa,$nloa
bcb43bb3 712 sllx %o2,32,%o2
2e21922e 713 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
714 sllx %o3,48,%o7
715 or %o1,%o0,%o0
2e21922e 716 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 717 or %o2,%o0,%o0
2e21922e 718 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3 719 or %o7,%o0,%o0 ! 64-bit result
2e21922e 720 faddd $nloc,$nhia,$nloc
bcb43bb3 721 addcc %g1,%o0,%o0
ebae8092 722 ldx [$tp+8],%o7 ! tp[j]
2e21922e 723 faddd $nlod,$nhib,$nlod
bcb43bb3 724 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 725 fdtox $nloa,$nloa
bcb43bb3
AP
726 bcs,a %xcc,.+8
727 add %g1,1,%g1
2e21922e 728 fdtox $nlob,$nlob
bcb43bb3 729 addcc %o7,%o0,%o0
2e21922e 730 fdtox $nloc,$nloc
bcb43bb3
AP
731 bcs,a %xcc,.+8
732 add %g1,1,%g1
733
734 stx %o0,[$tp] ! tp[j-1]
2e21922e 735 fdtox $nlod,$nlod
1c3d2b94
AP
736
737 std $nloa,[%sp+$bias+$frame+0]
738 std $nlob,[%sp+$bias+$frame+8]
739 std $nloc,[%sp+$bias+$frame+16]
aa2be094 740 addcc $j,8,$j
2e21922e 741 std $nlod,[%sp+$bias+$frame+24]
aa2be094 742 bnz,pt %icc,.Linner
bcb43bb3 743 add $tp,8,$tp
1c3d2b94
AP
744\f
745.Linnerskip:
2e21922e
AP
746 fdtox $dota,$dota
747 fdtox $dotb,$dotb
748
1c3d2b94
AP
749 ldx [%sp+$bias+$frame+0],%o0
750 ldx [%sp+$bias+$frame+8],%o1
751 ldx [%sp+$bias+$frame+16],%o2
752 ldx [%sp+$bias+$frame+24],%o3
753
754 srlx %o0,16,%o7
2e21922e 755 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 756 add %o7,%o1,%o1
2e21922e 757 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
758 srlx %o1,16,%o7
759 add %o7,%o2,%o2
760 srlx %o2,16,%o7
761 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
762 and %o0,$mask,%o0
763 and %o1,$mask,%o1
764 and %o2,$mask,%o2
765 sllx %o1,16,%o1
766 sllx %o2,32,%o2
767 sllx %o3,48,%o7
768 or %o1,%o0,%o0
769 or %o2,%o0,%o0
2e21922e 770 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 771 or %o7,%o0,%o0 ! 64-bit result
2e21922e 772 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94 773 addcc %g1,%o0,%o0
2e21922e 774 ldx [$tp+8],%o7 ! tp[j]
1c3d2b94
AP
775 srlx %o3,16,%g1 ! 34-bit carry
776 bcs,a %xcc,.+8
777 add %g1,1,%g1
778
1c3d2b94
AP
779 addcc %o7,%o0,%o0
780 bcs,a %xcc,.+8
781 add %g1,1,%g1
782
783 stx %o0,[$tp] ! tp[j-1]
784 add $tp,8,$tp
bcb43bb3 785
2e21922e
AP
786 srlx %o4,16,%o7
787 add %o7,%o5,%o5
788 and %o4,$mask,%o4
789 sllx %o5,16,%o7
790 or %o7,%o4,%o4
791 addcc %g1,%o4,%o4
792 srlx %o5,48,%g1
bcb43bb3
AP
793 bcs,a %xcc,.+8
794 add %g1,1,%g1
795
2e21922e
AP
796 addcc $carry,%o4,%o4
797 stx %o4,[$tp] ! tp[num-1]
bcb43bb3
AP
798 mov %g1,$carry
799 bcs,a %xcc,.+8
800 add $carry,1,$carry
801
aa2be094
AP
802 addcc $i,8,$i
803 bnz %icc,.Louter
bcb43bb3
AP
804 nop
805\f
7d9cf7c0 806 add $tp,8,$tp ! adjust tp to point at the end
7d9cf7c0 807 orn %g0,%g0,%g4
7d9cf7c0 808 sub %g0,$num,%o7 ! n=-num
23296942 809 ba .Lsub
673c55a2 810 subcc %g0,%g0,%g0 ! clear %icc.c
23296942
AP
811
812.align 32
bcb43bb3 813.Lsub:
87d3af64
AP
814 ldx [$tp+%o7],%o0
815 add $np,%o7,%g1
816 ld [%g1+0],%o2
817 ld [%g1+4],%o3
818 srlx %o0,32,%o1
819 subccc %o0,%o2,%o2
820 add $rp,%o7,%g1
821 subccc %o1,%o3,%o3
822 st %o2,[%g1+0]
aa2be094
AP
823 add %o7,8,%o7
824 brnz,pt %o7,.Lsub
87d3af64 825 st %o3,[%g1+4]
7d9cf7c0 826 subc $carry,0,%g4
6df8c74d 827 sub %g0,$num,%o7 ! n=-num
23296942
AP
828 ba .Lcopy
829 nop
bcb43bb3 830
23296942 831.align 32
bcb43bb3 832.Lcopy:
aa2be094 833 ldx [$tp+%o7],%o0
87d3af64 834 add $rp,%o7,%g1
7d9cf7c0
AP
835 ld [%g1+0],%o2
836 ld [%g1+4],%o3
837 stx %g0,[$tp+%o7]
838 and %o0,%g4,%o0
839 srlx %o0,32,%o1
840 andn %o2,%g4,%o2
841 andn %o3,%g4,%o3
842 or %o2,%o0,%o0
843 or %o3,%o1,%o1
87d3af64 844 st %o0,[%g1+0]
aa2be094
AP
845 add %o7,8,%o7
846 brnz,pt %o7,.Lcopy
87d3af64 847 st %o1,[%g1+4]
6df8c74d 848 sub %g0,$num,%o7 ! n=-num
bcb43bb3 849
bcb43bb3 850.Lzap:
aa2be094
AP
851 stx %g0,[$ap_l+%o7]
852 stx %g0,[$ap_h+%o7]
853 stx %g0,[$np_l+%o7]
854 stx %g0,[$np_h+%o7]
855 add %o7,8,%o7
856 brnz,pt %o7,.Lzap
bcb43bb3
AP
857 nop
858
859 ldx [%sp+$bias+$frame+48],%o7
860 wr %g0,%o7,%asi ! restore %asi
861
862 mov 1,%i0
aa2be094 863.Lret:
bcb43bb3
AP
864 ret
865 restore
866.type $fname,#function
867.size $fname,(.-$fname)
87d3af64 868.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
23296942 869.align 32
bcb43bb3
AP
870___
871
872$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3b4a0225
AP
873
874# Below substitution makes it possible to compile without demanding
478b50cf 875# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
3b4a0225
AP
876# dare to do this, because VIS capability is detected at run-time now
877# and this routine is not called on CPUs incapable of executing it. Do
878# note that fzeros is not the only VIS dependency! Another dependency
879# is implicit and is just _a_ numerical value loaded to %asi register,
880# which assembler can't recognize as VIS specific...
881$code =~ s/fzeros\s+%f([0-9]+)/
882 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
883 /gem;
884
bcb43bb3 885print $code;
3b4a0225 886# flush
bcb43bb3 887close STDOUT;