]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/sparcv9a-mont.pl
Update copyright year
[thirdparty/openssl.git] / crypto / bn / asm / sparcv9a-mont.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
54b40531 2# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
367ace68 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
bcb43bb3
AP
9
10# ====================================================================
e3713c36 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
2e21922e
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
bcb43bb3
AP
15# ====================================================================
16
aa2be094
AP
17# October 2005
18#
bcb43bb3
AP
19# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20# Because unlike integer multiplier, which simply stalls whole CPU,
21# FPU is fully pipelined and can effectively emit 48 bit partial
22# product every cycle. Why not blended SPARC v9? One can argue that
23# making this module dependent on UltraSPARC VIS extension limits its
a4d729f3
AP
24# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25# implementations from compatibility matrix. But the rest, whole Sun
26# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27# VIS extension instructions used in this module. This is considered
73b979e6
AP
28# good enough to not care about HAL SPARC64 users [if any], who have an
29# integer-only pure SPARCv9 module to "fall back" to.
bcb43bb3
AP
30
31# USI&II cores currently exhibit uniform 2x improvement [over pre-
32# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
33# performance improves by a few percent for shorter keys and worsens by
aa2be094 34# a few percent for longer keys. This is because USIII integer multiplier
bcb43bb3
AP
35# is >3x faster than USI&II one, which is harder to match [but see
36# TODO list below]. It should also be noted that SPARC64 V features
37# out-of-order execution, which *might* mean that integer multiplier
a4d729f3
AP
38# is pipelined, which in turn *might* be impossible to match... On an
39# additional note, SPARC64 V implements an FP Multiply-Add instruction,
40# which is perfectly usable in this context... In other words, as far
73b979e6 41# as Fujitsu SPARC64 V goes, talk to the author:-)
aa2be094 42
a00e414f
AP
43# The implementation imposes the following "non-natural" limitations on
44# input arguments:
aa2be094
AP
45# - num may not be less than 4;
46# - num has to be even;
aa2be094
AP
47# Failure to meet either condition has no fatal effects; the routine
48# simply returns 0 and so doesn't give any performance gain.
49
bcb43bb3 50# TODO:
bcb43bb3
AP
51# - modulo-schedule inner loop for better performance (on in-order
52# execution core such as UltraSPARC this shall result in further
53# noticeable(!) improvement);
54# - dedicated squaring procedure[?];
55
2e21922e
AP
56######################################################################
57# November 2006
58#
59# Modulo-scheduled inner loops allow interleaving floating point and
60# integer instructions and minimize Read-After-Write penalties. This
60250017 61# results in *further* 20-50% performance improvement [depending on
2e21922e
AP
62# key length, more for longer keys] on USI&II cores and 30-80% - on
63# USIII&IV.
64
1aa89a7a
RL
65# $output is the last argument if it looks like a file (it has an extension)
66$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
67
68$output and open STDOUT,">$output";
6bd7a4d9 69
a00e414f 70$fname="bn_mul_mont_fpu";
eb77e888
AP
71
72$frame="STACK_FRAME";
73$bias="STACK_BIAS";
bcb43bb3
AP
74$locals=64;
75
76# In order to provide for 32-/64-bit ABI duality, I keep integers wider
77# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
78# exclusively for pointers, indexes and other small values...
79# int bn_mul_mont(
80$rp="%i0"; # BN_ULONG *rp,
81$ap="%i1"; # const BN_ULONG *ap,
82$bp="%i2"; # const BN_ULONG *bp,
83$np="%i3"; # const BN_ULONG *np,
4d524040 84$n0="%i4"; # const BN_ULONG *n0,
bcb43bb3
AP
85$num="%i5"; # int num);
86
aa2be094 87$tp="%l0"; # t[num]
bcb43bb3
AP
88$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
89$ap_h="%l2"; # to these four vectors as double-precision FP values.
90$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
91$np_h="%l4"; # loop and L1-cache aliasing is minimized...
92$i="%l5";
93$j="%l6";
94$mask="%l7"; # 16-bit mask, 0xffff
95
aa2be094
AP
96$n0="%g4"; # reassigned(!) to "64-bit" register
97$carry="%i4"; # %i4 reused(!) for a carry bit
bcb43bb3
AP
98
99# FP register naming chart
100#
101# ..HILO
102# dcba
103# --------
104# LOa
105# LOb
106# LOc
107# LOd
108# HIa
109# HIb
110# HIc
111# HId
112# ..a
113# ..b
114$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
115$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
116$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
117$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
118
119$dota="%f24"; $dotb="%f26";
120
121$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
122$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
123$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
124$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
125
126$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
127
128$code=<<___;
52f7e44e
TM
129#ifndef __ASSEMBLER__
130# define __ASSEMBLER__ 1
131#endif
132#include "crypto/sparc_arch.h"
eb77e888 133
bcb43bb3
AP
134.section ".text",#alloc,#execinstr
135
136.global $fname
137.align 32
138$fname:
aa2be094 139 save %sp,-$frame-$locals,%sp
6df8c74d 140
aa2be094
AP
141 cmp $num,4
142 bl,a,pn %icc,.Lret
143 clr %i0
144 andcc $num,1,%g0 ! $num has to be even...
145 bnz,a,pn %icc,.Lret
146 clr %i0 ! signal "unsupported input value"
760e3535 147
aa2be094 148 srl $num,1,$num
760e3535 149 sethi %hi(0xffff),$mask
aa2be094 150 ld [%i4+0],$n0 ! $n0 reassigned, remember?
760e3535 151 or $mask,%lo(0xffff),$mask
aa2be094
AP
152 ld [%i4+4],%o0
153 sllx %o0,32,%o0
154 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
6df8c74d 155
aa2be094 156 sll $num,3,$num ! num*=8
bcb43bb3
AP
157
158 add %sp,$bias,%o0 ! real top of stack
159 sll $num,2,%o1
160 add %o1,$num,%o1 ! %o1=num*5
161 sub %o0,%o1,%o0
bcb43bb3 162 and %o0,-2048,%o0 ! optimize TLB utilization
aa2be094 163 sub %o0,$bias,%sp ! alloca(5*num*8)
bcb43bb3 164
aa2be094 165 rd %asi,%o7 ! save %asi
bcb43bb3
AP
166 add %sp,$bias+$frame+$locals,$tp
167 add $tp,$num,$ap_l
aa2be094 168 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
bcb43bb3
AP
169 add $ap_l,$num,$ap_h
170 add $ap_h,$num,$np_l
171 add $np_l,$num,$np_h
172
173 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
174
175 add $rp,$num,$rp ! readjust input pointers to point
176 add $ap,$num,$ap ! at the ends too...
177 add $bp,$num,$bp
178 add $np,$num,$np
179
aa2be094 180 stx %o7,[%sp+$bias+$frame+48] ! save %asi
bcb43bb3 181\f
6df8c74d
AP
182 sub %g0,$num,$i ! i=-num
183 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
184
185 add $ap,$j,%o3
186 add $bp,$i,%o4
6df8c74d 187
87d3af64
AP
188 ld [%o3+4],%g1 ! ap[0]
189 ld [%o3+0],%o0
190 ld [%o4+4],%g5 ! bp[0]
191 sllx %g1,32,%g1
192 ld [%o4+0],%o1
193 sllx %g5,32,%g5
6df8c74d
AP
194 or %g1,%o0,%o0
195 or %g5,%o1,%o1
196
aa2be094 197 add $np,$j,%o5
bcb43bb3
AP
198
199 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
200 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
aa2be094 201 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 202
6df8c74d 203 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 204 fzeros $alo
6df8c74d 205 ld [%o3+4],$ahi_
aa2be094 206 fzeros $ahi
6df8c74d 207 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 208 fzeros $nlo
6df8c74d 209 ld [%o5+4],$nhi_
aa2be094 210 fzeros $nhi
bcb43bb3
AP
211
212 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d 213 ldda [%o4+2]%asi,$ba
bcb43bb3 214 fxtod $alo,$alo
6df8c74d 215 ldda [%o4+0]%asi,$bb
bcb43bb3 216 fxtod $ahi,$ahi
6df8c74d 217 ldda [%o4+6]%asi,$bc
bcb43bb3 218 fxtod $nlo,$nlo
6df8c74d 219 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
220 fxtod $nhi,$nhi
221
222 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
aa2be094 223 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 224 fxtod $ba,$ba
aa2be094 225 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 226 fxtod $bb,$bb
aa2be094 227 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 228 fxtod $bc,$bc
aa2be094 229 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
230 fxtod $bd,$bd
231
232 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
233 fxtod $na,$na
234 std $ahi,[$ap_h+$j]
235 fxtod $nb,$nb
236 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
237 fxtod $nc,$nc
238 std $nhi,[$np_h+$j]
239 fxtod $nd,$nd
240
aa2be094
AP
241 fmuld $alo,$ba,$aloa
242 fmuld $nlo,$na,$nloa
243 fmuld $alo,$bb,$alob
244 fmuld $nlo,$nb,$nlob
245 fmuld $alo,$bc,$aloc
aa2be094 246 faddd $aloa,$nloa,$nloa
6df8c74d 247 fmuld $nlo,$nc,$nloc
aa2be094 248 fmuld $alo,$bd,$alod
aa2be094 249 faddd $alob,$nlob,$nlob
6df8c74d 250 fmuld $nlo,$nd,$nlod
aa2be094 251 fmuld $ahi,$ba,$ahia
aa2be094 252 faddd $aloc,$nloc,$nloc
6df8c74d 253 fmuld $nhi,$na,$nhia
aa2be094 254 fmuld $ahi,$bb,$ahib
aa2be094 255 faddd $alod,$nlod,$nlod
6df8c74d 256 fmuld $nhi,$nb,$nhib
aa2be094 257 fmuld $ahi,$bc,$ahic
aa2be094 258 faddd $ahia,$nhia,$nhia
6df8c74d 259 fmuld $nhi,$nc,$nhic
aa2be094 260 fmuld $ahi,$bd,$ahid
6df8c74d 261 faddd $ahib,$nhib,$nhib
aa2be094 262 fmuld $nhi,$nd,$nhid
bcb43bb3 263
bcb43bb3
AP
264 faddd $ahic,$nhic,$dota ! $nhic
265 faddd $ahid,$nhid,$dotb ! $nhid
266
267 faddd $nloc,$nhia,$nloc
268 faddd $nlod,$nhib,$nlod
269
270 fdtox $nloa,$nloa
271 fdtox $nlob,$nlob
272 fdtox $nloc,$nloc
273 fdtox $nlod,$nlod
274
275 std $nloa,[%sp+$bias+$frame+0]
2e21922e 276 add $j,8,$j
bcb43bb3 277 std $nlob,[%sp+$bias+$frame+8]
2e21922e 278 add $ap,$j,%o4
bcb43bb3 279 std $nloc,[%sp+$bias+$frame+16]
2e21922e 280 add $np,$j,%o5
bcb43bb3 281 std $nlod,[%sp+$bias+$frame+24]
bcb43bb3 282\f
1c3d2b94 283 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 284 fzeros $alo
1c3d2b94 285 ld [%o4+4],$ahi_
aa2be094 286 fzeros $ahi
1c3d2b94 287 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 288 fzeros $nlo
1c3d2b94 289 ld [%o5+4],$nhi_
aa2be094 290 fzeros $nhi
bcb43bb3
AP
291
292 fxtod $alo,$alo
293 fxtod $ahi,$ahi
294 fxtod $nlo,$nlo
295 fxtod $nhi,$nhi
296
2e21922e 297 ldx [%sp+$bias+$frame+0],%o0
aa2be094 298 fmuld $alo,$ba,$aloa
2e21922e 299 ldx [%sp+$bias+$frame+8],%o1
aa2be094 300 fmuld $nlo,$na,$nloa
2e21922e 301 ldx [%sp+$bias+$frame+16],%o2
aa2be094 302 fmuld $alo,$bb,$alob
2e21922e 303 ldx [%sp+$bias+$frame+24],%o3
aa2be094 304 fmuld $nlo,$nb,$nlob
2e21922e
AP
305
306 srlx %o0,16,%o7
307 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
aa2be094 308 fmuld $alo,$bc,$aloc
2e21922e
AP
309 add %o7,%o1,%o1
310 std $ahi,[$ap_h+$j]
311 faddd $aloa,$nloa,$nloa
6df8c74d 312 fmuld $nlo,$nc,$nloc
2e21922e
AP
313 srlx %o1,16,%o7
314 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
aa2be094 315 fmuld $alo,$bd,$alod
2e21922e
AP
316 add %o7,%o2,%o2
317 std $nhi,[$np_h+$j]
318 faddd $alob,$nlob,$nlob
6df8c74d 319 fmuld $nlo,$nd,$nlod
2e21922e 320 srlx %o2,16,%o7
aa2be094 321 fmuld $ahi,$ba,$ahia
2e21922e
AP
322 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
323 faddd $aloc,$nloc,$nloc
6df8c74d 324 fmuld $nhi,$na,$nhia
2e21922e
AP
325 !and %o0,$mask,%o0
326 !and %o1,$mask,%o1
327 !and %o2,$mask,%o2
328 !sllx %o1,16,%o1
329 !sllx %o2,32,%o2
330 !sllx %o3,48,%o7
331 !or %o1,%o0,%o0
332 !or %o2,%o0,%o0
333 !or %o7,%o0,%o0 ! 64-bit result
334 srlx %o3,16,%g1 ! 34-bit carry
aa2be094 335 fmuld $ahi,$bb,$ahib
2e21922e 336
aa2be094 337 faddd $alod,$nlod,$nlod
6df8c74d 338 fmuld $nhi,$nb,$nhib
aa2be094 339 fmuld $ahi,$bc,$ahic
aa2be094 340 faddd $ahia,$nhia,$nhia
6df8c74d 341 fmuld $nhi,$nc,$nhic
aa2be094 342 fmuld $ahi,$bd,$ahid
aa2be094 343 faddd $ahib,$nhib,$nhib
6df8c74d 344 fmuld $nhi,$nd,$nhid
bcb43bb3
AP
345
346 faddd $dota,$nloa,$nloa
347 faddd $dotb,$nlob,$nlob
348 faddd $ahic,$nhic,$dota ! $nhic
349 faddd $ahid,$nhid,$dotb ! $nhid
350
351 faddd $nloc,$nhia,$nloc
352 faddd $nlod,$nhib,$nlod
353
354 fdtox $nloa,$nloa
355 fdtox $nlob,$nlob
356 fdtox $nloc,$nloc
357 fdtox $nlod,$nlod
358
359 std $nloa,[%sp+$bias+$frame+0]
360 std $nlob,[%sp+$bias+$frame+8]
2e21922e 361 addcc $j,8,$j
bcb43bb3 362 std $nloc,[%sp+$bias+$frame+16]
2e21922e 363 bz,pn %icc,.L1stskip
bcb43bb3 364 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 365\f
23296942 366.align 32 ! incidentally already aligned !
1c3d2b94 367.L1st:
1c3d2b94
AP
368 add $ap,$j,%o4
369 add $np,$j,%o5
370 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
371 fzeros $alo
372 ld [%o4+4],$ahi_
373 fzeros $ahi
374 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
375 fzeros $nlo
376 ld [%o5+4],$nhi_
377 fzeros $nhi
378
379 fxtod $alo,$alo
380 fxtod $ahi,$ahi
381 fxtod $nlo,$nlo
382 fxtod $nhi,$nhi
383
2e21922e 384 ldx [%sp+$bias+$frame+0],%o0
1c3d2b94 385 fmuld $alo,$ba,$aloa
2e21922e 386 ldx [%sp+$bias+$frame+8],%o1
1c3d2b94 387 fmuld $nlo,$na,$nloa
2e21922e 388 ldx [%sp+$bias+$frame+16],%o2
1c3d2b94 389 fmuld $alo,$bb,$alob
2e21922e 390 ldx [%sp+$bias+$frame+24],%o3
1c3d2b94 391 fmuld $nlo,$nb,$nlob
2e21922e
AP
392
393 srlx %o0,16,%o7
394 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
1c3d2b94 395 fmuld $alo,$bc,$aloc
2e21922e
AP
396 add %o7,%o1,%o1
397 std $ahi,[$ap_h+$j]
398 faddd $aloa,$nloa,$nloa
1c3d2b94 399 fmuld $nlo,$nc,$nloc
2e21922e
AP
400 srlx %o1,16,%o7
401 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
1c3d2b94 402 fmuld $alo,$bd,$alod
2e21922e
AP
403 add %o7,%o2,%o2
404 std $nhi,[$np_h+$j]
405 faddd $alob,$nlob,$nlob
1c3d2b94 406 fmuld $nlo,$nd,$nlod
2e21922e 407 srlx %o2,16,%o7
1c3d2b94 408 fmuld $ahi,$ba,$ahia
2e21922e
AP
409 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
410 and %o0,$mask,%o0
411 faddd $aloc,$nloc,$nloc
1c3d2b94 412 fmuld $nhi,$na,$nhia
2e21922e
AP
413 and %o1,$mask,%o1
414 and %o2,$mask,%o2
1c3d2b94 415 fmuld $ahi,$bb,$ahib
2e21922e
AP
416 sllx %o1,16,%o1
417 faddd $alod,$nlod,$nlod
1c3d2b94 418 fmuld $nhi,$nb,$nhib
2e21922e 419 sllx %o2,32,%o2
1c3d2b94 420 fmuld $ahi,$bc,$ahic
2e21922e
AP
421 sllx %o3,48,%o7
422 or %o1,%o0,%o0
423 faddd $ahia,$nhia,$nhia
1c3d2b94 424 fmuld $nhi,$nc,$nhic
2e21922e 425 or %o2,%o0,%o0
1c3d2b94 426 fmuld $ahi,$bd,$ahid
2e21922e
AP
427 or %o7,%o0,%o0 ! 64-bit result
428 faddd $ahib,$nhib,$nhib
1c3d2b94 429 fmuld $nhi,$nd,$nhid
2e21922e
AP
430 addcc %g1,%o0,%o0
431 faddd $dota,$nloa,$nloa
432 srlx %o3,16,%g1 ! 34-bit carry
433 faddd $dotb,$nlob,$nlob
434 bcs,a %xcc,.+8
435 add %g1,1,%g1
436
437 stx %o0,[$tp] ! tp[j-1]=
1c3d2b94 438
1c3d2b94
AP
439 faddd $ahic,$nhic,$dota ! $nhic
440 faddd $ahid,$nhid,$dotb ! $nhid
441
442 faddd $nloc,$nhia,$nloc
443 faddd $nlod,$nhib,$nlod
444
445 fdtox $nloa,$nloa
446 fdtox $nlob,$nlob
447 fdtox $nloc,$nloc
448 fdtox $nlod,$nlod
449
450 std $nloa,[%sp+$bias+$frame+0]
451 std $nlob,[%sp+$bias+$frame+8]
452 std $nloc,[%sp+$bias+$frame+16]
453 std $nlod,[%sp+$bias+$frame+24]
454
aa2be094
AP
455 addcc $j,8,$j
456 bnz,pt %icc,.L1st
bcb43bb3 457 add $tp,8,$tp
1c3d2b94
AP
458\f
459.L1stskip:
ebae8092
AP
460 fdtox $dota,$dota
461 fdtox $dotb,$dotb
462
1c3d2b94
AP
463 ldx [%sp+$bias+$frame+0],%o0
464 ldx [%sp+$bias+$frame+8],%o1
465 ldx [%sp+$bias+$frame+16],%o2
466 ldx [%sp+$bias+$frame+24],%o3
467
468 srlx %o0,16,%o7
ebae8092 469 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 470 add %o7,%o1,%o1
ebae8092 471 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
472 srlx %o1,16,%o7
473 add %o7,%o2,%o2
474 srlx %o2,16,%o7
475 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
476 and %o0,$mask,%o0
477 and %o1,$mask,%o1
478 and %o2,$mask,%o2
479 sllx %o1,16,%o1
480 sllx %o2,32,%o2
481 sllx %o3,48,%o7
482 or %o1,%o0,%o0
483 or %o2,%o0,%o0
484 or %o7,%o0,%o0 ! 64-bit result
ebae8092 485 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 486 addcc %g1,%o0,%o0
ebae8092 487 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94
AP
488 srlx %o3,16,%g1 ! 34-bit carry
489 bcs,a %xcc,.+8
490 add %g1,1,%g1
491
492 stx %o0,[$tp] ! tp[j-1]=
493 add $tp,8,$tp
bcb43bb3 494
ebae8092
AP
495 srlx %o4,16,%o7
496 add %o7,%o5,%o5
497 and %o4,$mask,%o4
498 sllx %o5,16,%o7
499 or %o7,%o4,%o4
500 addcc %g1,%o4,%o4
501 srlx %o5,48,%g1
bcb43bb3
AP
502 bcs,a %xcc,.+8
503 add %g1,1,%g1
504
505 mov %g1,$carry
ebae8092 506 stx %o4,[$tp] ! tp[num-1]=
bcb43bb3
AP
507\f
508 ba .Louter
509 add $i,8,$i
510.align 32
511.Louter:
6df8c74d 512 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
513 add %sp,$bias+$frame+$locals,$tp
514
87d3af64 515 add $ap,$j,%o3
bcb43bb3 516 add $bp,$i,%o4
6df8c74d 517
87d3af64
AP
518 ld [%o3+4],%g1 ! ap[0]
519 ld [%o3+0],%o0
520 ld [%o4+4],%g5 ! bp[i]
521 sllx %g1,32,%g1
522 ld [%o4+0],%o1
523 sllx %g5,32,%g5
6df8c74d
AP
524 or %g1,%o0,%o0
525 or %g5,%o1,%o1
526
bcb43bb3
AP
527 ldx [$tp],%o2 ! tp[0]
528 mulx %o1,%o0,%o0
529 addcc %o2,%o0,%o0
530 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
aa2be094 531 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 532
bcb43bb3 533 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d
AP
534 ldda [%o4+2]%asi,$ba
535 ldda [%o4+0]%asi,$bb
536 ldda [%o4+6]%asi,$bc
537 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
538
539 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
aa2be094 540 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 541 fxtod $ba,$ba
aa2be094 542 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 543 fxtod $bb,$bb
aa2be094 544 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 545 fxtod $bc,$bc
aa2be094 546 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
547 fxtod $bd,$bd
548 ldd [$ap_l+$j],$alo ! load a[j] in double format
549 fxtod $na,$na
550 ldd [$ap_h+$j],$ahi
551 fxtod $nb,$nb
552 ldd [$np_l+$j],$nlo ! load n[j] in double format
553 fxtod $nc,$nc
554 ldd [$np_h+$j],$nhi
555 fxtod $nd,$nd
556
aa2be094
AP
557 fmuld $alo,$ba,$aloa
558 fmuld $nlo,$na,$nloa
559 fmuld $alo,$bb,$alob
560 fmuld $nlo,$nb,$nlob
561 fmuld $alo,$bc,$aloc
aa2be094 562 faddd $aloa,$nloa,$nloa
6df8c74d 563 fmuld $nlo,$nc,$nloc
aa2be094 564 fmuld $alo,$bd,$alod
aa2be094 565 faddd $alob,$nlob,$nlob
6df8c74d 566 fmuld $nlo,$nd,$nlod
aa2be094 567 fmuld $ahi,$ba,$ahia
aa2be094 568 faddd $aloc,$nloc,$nloc
6df8c74d 569 fmuld $nhi,$na,$nhia
aa2be094 570 fmuld $ahi,$bb,$ahib
aa2be094 571 faddd $alod,$nlod,$nlod
6df8c74d 572 fmuld $nhi,$nb,$nhib
aa2be094 573 fmuld $ahi,$bc,$ahic
aa2be094 574 faddd $ahia,$nhia,$nhia
6df8c74d 575 fmuld $nhi,$nc,$nhic
aa2be094 576 fmuld $ahi,$bd,$ahid
6df8c74d 577 faddd $ahib,$nhib,$nhib
aa2be094 578 fmuld $nhi,$nd,$nhid
bcb43bb3 579
bcb43bb3
AP
580 faddd $ahic,$nhic,$dota ! $nhic
581 faddd $ahid,$nhid,$dotb ! $nhid
582
583 faddd $nloc,$nhia,$nloc
584 faddd $nlod,$nhib,$nlod
585
586 fdtox $nloa,$nloa
587 fdtox $nlob,$nlob
588 fdtox $nloc,$nloc
589 fdtox $nlod,$nlod
590
591 std $nloa,[%sp+$bias+$frame+0]
592 std $nlob,[%sp+$bias+$frame+8]
593 std $nloc,[%sp+$bias+$frame+16]
2e21922e 594 add $j,8,$j
bcb43bb3 595 std $nlod,[%sp+$bias+$frame+24]
2e21922e
AP
596\f
597 ldd [$ap_l+$j],$alo ! load a[j] in double format
598 ldd [$ap_h+$j],$ahi
599 ldd [$np_l+$j],$nlo ! load n[j] in double format
600 ldd [$np_h+$j],$nhi
601
602 fmuld $alo,$ba,$aloa
603 fmuld $nlo,$na,$nloa
604 fmuld $alo,$bb,$alob
605 fmuld $nlo,$nb,$nlob
606 fmuld $alo,$bc,$aloc
bcb43bb3 607 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
608 faddd $aloa,$nloa,$nloa
609 fmuld $nlo,$nc,$nloc
bcb43bb3 610 ldx [%sp+$bias+$frame+8],%o1
2e21922e 611 fmuld $alo,$bd,$alod
bcb43bb3 612 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
613 faddd $alob,$nlob,$nlob
614 fmuld $nlo,$nd,$nlod
bcb43bb3 615 ldx [%sp+$bias+$frame+24],%o3
2e21922e 616 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
617
618 srlx %o0,16,%o7
2e21922e
AP
619 faddd $aloc,$nloc,$nloc
620 fmuld $nhi,$na,$nhia
bcb43bb3 621 add %o7,%o1,%o1
2e21922e 622 fmuld $ahi,$bb,$ahib
bcb43bb3 623 srlx %o1,16,%o7
2e21922e
AP
624 faddd $alod,$nlod,$nlod
625 fmuld $nhi,$nb,$nhib
bcb43bb3 626 add %o7,%o2,%o2
2e21922e 627 fmuld $ahi,$bc,$ahic
bcb43bb3 628 srlx %o2,16,%o7
2e21922e
AP
629 faddd $ahia,$nhia,$nhia
630 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
631 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
632 ! why?
633 and %o0,$mask,%o0
2e21922e 634 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
635 and %o1,$mask,%o1
636 and %o2,$mask,%o2
2e21922e
AP
637 faddd $ahib,$nhib,$nhib
638 fmuld $nhi,$nd,$nhid
bcb43bb3 639 sllx %o1,16,%o1
2e21922e 640 faddd $dota,$nloa,$nloa
bcb43bb3 641 sllx %o2,32,%o2
2e21922e 642 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
643 sllx %o3,48,%o7
644 or %o1,%o0,%o0
2e21922e 645 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 646 or %o2,%o0,%o0
2e21922e 647 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3
AP
648 or %o7,%o0,%o0 ! 64-bit result
649 ldx [$tp],%o7
2e21922e 650 faddd $nloc,$nhia,$nloc
bcb43bb3
AP
651 addcc %o7,%o0,%o0
652 ! end-of-why?
2e21922e 653 faddd $nlod,$nhib,$nlod
bcb43bb3 654 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 655 fdtox $nloa,$nloa
bcb43bb3
AP
656 bcs,a %xcc,.+8
657 add %g1,1,%g1
bcb43bb3 658
bcb43bb3
AP
659 fdtox $nlob,$nlob
660 fdtox $nloc,$nloc
661 fdtox $nlod,$nlod
662
663 std $nloa,[%sp+$bias+$frame+0]
664 std $nlob,[%sp+$bias+$frame+8]
2e21922e 665 addcc $j,8,$j
bcb43bb3 666 std $nloc,[%sp+$bias+$frame+16]
2e21922e 667 bz,pn %icc,.Linnerskip
bcb43bb3 668 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 669\f
ebae8092
AP
670 ba .Linner
671 nop
672.align 32
1c3d2b94 673.Linner:
2e21922e
AP
674 ldd [$ap_l+$j],$alo ! load a[j] in double format
675 ldd [$ap_h+$j],$ahi
676 ldd [$np_l+$j],$nlo ! load n[j] in double format
677 ldd [$np_h+$j],$nhi
678
679 fmuld $alo,$ba,$aloa
680 fmuld $nlo,$na,$nloa
681 fmuld $alo,$bb,$alob
682 fmuld $nlo,$nb,$nlob
683 fmuld $alo,$bc,$aloc
bcb43bb3 684 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
685 faddd $aloa,$nloa,$nloa
686 fmuld $nlo,$nc,$nloc
bcb43bb3 687 ldx [%sp+$bias+$frame+8],%o1
2e21922e 688 fmuld $alo,$bd,$alod
bcb43bb3 689 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
690 faddd $alob,$nlob,$nlob
691 fmuld $nlo,$nd,$nlod
bcb43bb3 692 ldx [%sp+$bias+$frame+24],%o3
2e21922e 693 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
694
695 srlx %o0,16,%o7
2e21922e
AP
696 faddd $aloc,$nloc,$nloc
697 fmuld $nhi,$na,$nhia
bcb43bb3 698 add %o7,%o1,%o1
2e21922e 699 fmuld $ahi,$bb,$ahib
bcb43bb3 700 srlx %o1,16,%o7
2e21922e
AP
701 faddd $alod,$nlod,$nlod
702 fmuld $nhi,$nb,$nhib
bcb43bb3 703 add %o7,%o2,%o2
2e21922e 704 fmuld $ahi,$bc,$ahic
bcb43bb3 705 srlx %o2,16,%o7
2e21922e
AP
706 faddd $ahia,$nhia,$nhia
707 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
708 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
709 and %o0,$mask,%o0
2e21922e 710 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
711 and %o1,$mask,%o1
712 and %o2,$mask,%o2
2e21922e
AP
713 faddd $ahib,$nhib,$nhib
714 fmuld $nhi,$nd,$nhid
bcb43bb3 715 sllx %o1,16,%o1
2e21922e 716 faddd $dota,$nloa,$nloa
bcb43bb3 717 sllx %o2,32,%o2
2e21922e 718 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
719 sllx %o3,48,%o7
720 or %o1,%o0,%o0
2e21922e 721 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 722 or %o2,%o0,%o0
2e21922e 723 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3 724 or %o7,%o0,%o0 ! 64-bit result
2e21922e 725 faddd $nloc,$nhia,$nloc
bcb43bb3 726 addcc %g1,%o0,%o0
ebae8092 727 ldx [$tp+8],%o7 ! tp[j]
2e21922e 728 faddd $nlod,$nhib,$nlod
bcb43bb3 729 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 730 fdtox $nloa,$nloa
bcb43bb3
AP
731 bcs,a %xcc,.+8
732 add %g1,1,%g1
2e21922e 733 fdtox $nlob,$nlob
bcb43bb3 734 addcc %o7,%o0,%o0
2e21922e 735 fdtox $nloc,$nloc
bcb43bb3
AP
736 bcs,a %xcc,.+8
737 add %g1,1,%g1
738
739 stx %o0,[$tp] ! tp[j-1]
2e21922e 740 fdtox $nlod,$nlod
1c3d2b94
AP
741
742 std $nloa,[%sp+$bias+$frame+0]
743 std $nlob,[%sp+$bias+$frame+8]
744 std $nloc,[%sp+$bias+$frame+16]
aa2be094 745 addcc $j,8,$j
2e21922e 746 std $nlod,[%sp+$bias+$frame+24]
aa2be094 747 bnz,pt %icc,.Linner
bcb43bb3 748 add $tp,8,$tp
1c3d2b94
AP
749\f
750.Linnerskip:
2e21922e
AP
751 fdtox $dota,$dota
752 fdtox $dotb,$dotb
753
1c3d2b94
AP
754 ldx [%sp+$bias+$frame+0],%o0
755 ldx [%sp+$bias+$frame+8],%o1
756 ldx [%sp+$bias+$frame+16],%o2
757 ldx [%sp+$bias+$frame+24],%o3
758
759 srlx %o0,16,%o7
2e21922e 760 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 761 add %o7,%o1,%o1
2e21922e 762 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
763 srlx %o1,16,%o7
764 add %o7,%o2,%o2
765 srlx %o2,16,%o7
766 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
767 and %o0,$mask,%o0
768 and %o1,$mask,%o1
769 and %o2,$mask,%o2
770 sllx %o1,16,%o1
771 sllx %o2,32,%o2
772 sllx %o3,48,%o7
773 or %o1,%o0,%o0
774 or %o2,%o0,%o0
2e21922e 775 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 776 or %o7,%o0,%o0 ! 64-bit result
2e21922e 777 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94 778 addcc %g1,%o0,%o0
2e21922e 779 ldx [$tp+8],%o7 ! tp[j]
1c3d2b94
AP
780 srlx %o3,16,%g1 ! 34-bit carry
781 bcs,a %xcc,.+8
782 add %g1,1,%g1
783
1c3d2b94
AP
784 addcc %o7,%o0,%o0
785 bcs,a %xcc,.+8
786 add %g1,1,%g1
787
788 stx %o0,[$tp] ! tp[j-1]
789 add $tp,8,$tp
bcb43bb3 790
2e21922e
AP
791 srlx %o4,16,%o7
792 add %o7,%o5,%o5
793 and %o4,$mask,%o4
794 sllx %o5,16,%o7
795 or %o7,%o4,%o4
796 addcc %g1,%o4,%o4
797 srlx %o5,48,%g1
bcb43bb3
AP
798 bcs,a %xcc,.+8
799 add %g1,1,%g1
800
2e21922e
AP
801 addcc $carry,%o4,%o4
802 stx %o4,[$tp] ! tp[num-1]
bcb43bb3
AP
803 mov %g1,$carry
804 bcs,a %xcc,.+8
805 add $carry,1,$carry
806
aa2be094
AP
807 addcc $i,8,$i
808 bnz %icc,.Louter
bcb43bb3
AP
809 nop
810\f
7d9cf7c0 811 add $tp,8,$tp ! adjust tp to point at the end
7d9cf7c0 812 orn %g0,%g0,%g4
7d9cf7c0 813 sub %g0,$num,%o7 ! n=-num
23296942 814 ba .Lsub
673c55a2 815 subcc %g0,%g0,%g0 ! clear %icc.c
23296942
AP
816
817.align 32
bcb43bb3 818.Lsub:
87d3af64
AP
819 ldx [$tp+%o7],%o0
820 add $np,%o7,%g1
821 ld [%g1+0],%o2
822 ld [%g1+4],%o3
823 srlx %o0,32,%o1
824 subccc %o0,%o2,%o2
825 add $rp,%o7,%g1
826 subccc %o1,%o3,%o3
827 st %o2,[%g1+0]
aa2be094
AP
828 add %o7,8,%o7
829 brnz,pt %o7,.Lsub
87d3af64 830 st %o3,[%g1+4]
7d9cf7c0 831 subc $carry,0,%g4
6df8c74d 832 sub %g0,$num,%o7 ! n=-num
23296942
AP
833 ba .Lcopy
834 nop
bcb43bb3 835
23296942 836.align 32
bcb43bb3 837.Lcopy:
aa2be094 838 ldx [$tp+%o7],%o0
87d3af64 839 add $rp,%o7,%g1
7d9cf7c0
AP
840 ld [%g1+0],%o2
841 ld [%g1+4],%o3
842 stx %g0,[$tp+%o7]
843 and %o0,%g4,%o0
844 srlx %o0,32,%o1
845 andn %o2,%g4,%o2
846 andn %o3,%g4,%o3
847 or %o2,%o0,%o0
848 or %o3,%o1,%o1
87d3af64 849 st %o0,[%g1+0]
aa2be094
AP
850 add %o7,8,%o7
851 brnz,pt %o7,.Lcopy
87d3af64 852 st %o1,[%g1+4]
6df8c74d 853 sub %g0,$num,%o7 ! n=-num
bcb43bb3 854
bcb43bb3 855.Lzap:
aa2be094
AP
856 stx %g0,[$ap_l+%o7]
857 stx %g0,[$ap_h+%o7]
858 stx %g0,[$np_l+%o7]
859 stx %g0,[$np_h+%o7]
860 add %o7,8,%o7
861 brnz,pt %o7,.Lzap
bcb43bb3
AP
862 nop
863
864 ldx [%sp+$bias+$frame+48],%o7
865 wr %g0,%o7,%asi ! restore %asi
866
867 mov 1,%i0
aa2be094 868.Lret:
bcb43bb3
AP
869 ret
870 restore
871.type $fname,#function
872.size $fname,(.-$fname)
46f4e1be 873.asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
23296942 874.align 32
bcb43bb3
AP
875___
876
877$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3b4a0225
AP
878
879# The substitution below makes it possible to compile without demanding
478b50cf 880# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
3b4a0225
AP
881# dare to do this, because VIS capability is detected at run-time now
882# and this routine is not called on CPUs not capable of executing it. Do
883# note that fzeros is not the only VIS dependency! Another dependency
884# is implicit and is just _a_ numerical value loaded to %asi register,
885# which assembler can't recognize as VIS specific...
886$code =~ s/fzeros\s+%f([0-9]+)/
887 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
888 /gem;
889
bcb43bb3 890print $code;
3b4a0225 891# flush
a21314db 892close STDOUT or die "error closing STDOUT: $!";