#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================
######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) a collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) a collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify results: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why no improvement is seen there. It could surely be improved [by
# deploying the 'mpmul' instruction], maybe in the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#		sign	verify	sign/s	verify/s
# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
#
# 64-bit process, this module:
#		sign	verify	sign/s	verify/s
# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
#
######################################################################
# 32-bit process, VIS3:
#		sign	verify	sign/s	verify/s
# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
#
# 32-bit process, this module:
#		sign	verify	sign/s	verify/s
# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
#
# 32-bit code is prone to performance degradation as the rate of
# interrupts dispatched to the CPU executing the code grows. This is
# because, in the standard handling of an interrupt in a 32-bit process
# context, the upper halves of most integer registers used as input or
# output are zeroed. This renders the result invalid, and the operation
# has to be re-run. If the CPU is "bothered" by timer interrupts only,
# the penalty is hardly measurable. But to mitigate this problem at
# higher interrupt rates, contemporary Linux kernels recognize a biased
# stack even in a 32-bit process context and preserve full register
# contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.
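#
# Editorial note: the routines below detect the clobbering condition
# described above via a sentinel value OR-ed into %fp across the
# register windows; when the sentinel is lost, the routine returns 0
# instead of 1, and the caller is expected to redo the operation with a
# fallback path. A minimal caller-side sketch, with a hypothetical
# fallback name used purely for illustration:
#
#   if (!bn_mul_mont_t4_8(rp, ap, bp, np, n0))
#       mul_mont_fallback(rp, ap, bp, np, n0, 8);   # e.g. a VIS3 path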

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop and open STDOUT,">$output";

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif

.section ".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
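#
# Editorial note on the maps above (a summary of how the generator
# names things, not a restatement of the architecture manual): @R is a
# bank of even-numbered %f registers, @N holds the modulus limbs in %l
# and %o registers (repeated so the list covers up to 32 limbs), @A
# aliases the first 14 limbs of the A operand onto integer registers
# and the remaining ones onto %f registers from @R, and @B spans %i, %l
# and %o registers for the B operand. The authoritative operand
# assignment for montmul/montsqr is the manual referenced above.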
\f
########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
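# $NUM is the operand length in 64-bit limbs, so the generated
# bn_mul_mont_t4_{8,16,24,32} routines cover the 512-, 1024-, 1536- and
# 2048-bit single-op cases listed in the header. Each routine returns 1
# on success and 0 when the sentinel check fails and the caller has to
# fall back (see the note above the module setup code).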
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl bn_mul_mont_t4_$NUM
.align 32
bn_mul_mont_t4_$NUM:
#ifdef __arch64__
	mov 0,$sentinel
	mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
	mov -2047,%g4
	and %g1,SPARCV9_64BIT_STACK,%g1
	movrz %g1,0,%g4
	mov -1,$sentinel
	add %g4,-128,%g4
#else
	mov -1,$sentinel
	mov -128,%g4
#endif
	sllx $sentinel,32,$sentinel
	save %sp,%g4,%sp
#ifndef __arch64__
	save %sp,-128,%sp ! warm it up
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and %sp,1,%g4
	or $sentinel,%fp,%fp
	or %g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov %i0,$rp
	mov %i1,$ap
	mov %i2,$bp
	mov %i3,$np
	ld [%i4+0],%f1 ! load *n0
	ld [%i4+4],%f0
	fsrc2 %f0,%f60
___
\f
# load ap[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld [$ap+$i*8+0],$lo
	ld [$ap+$i*8+4],@A[$i]
	sllx @A[$i],32,@A[$i]
	or $lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld [$ap+$i*8+0],$lo
	ld [$ap+$i*8+4],$hi
	fsrc2 $hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp $ap,$bp
	be SIZE_T_CC,.Lmsquare_$NUM
	nop
___
\f
# load bp[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld [$bp+$i*8+0],$lo
	ld [$bp+$i*8+4],@B[$i]
	sllx @B[$i],32,@B[$i]
	or $lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld [$bp+$i*8+0],$lo
	ld [$bp+$i*8+4],@B[$i]
	sllx @B[$i],32,@B[$i]
	or $lo,@B[$i],@B[$i]
___
}
# magic ################################################################
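# Editorial note: montmul is emitted as a raw .word, presumably so that
# the code assembles even with toolchains that do not know the T4
# instructions; the $NUM-1 added to the base opcode 0x81b02920 encodes
# the operand size in limbs. The same applies to the montsqr encoding
# 0x81b02940 used below.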
$code.=<<___;
	.word 0x81b02920+$NUM-1 ! montmul $NUM-1
.Lmresume_$NUM:
	fbu,pn %fcc3,.Lmabort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef __arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Lmabort1_$NUM
	restore
#endif
___
\f
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and %fp,$sentinel,$sentinel
	restore
	and $sentinel,1,%o7
	and %fp,$sentinel,$sentinel
	srl %fp,0,%fp ! just in case?
	or %o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov 0,%i0 ! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st $lo,[$rp+$i*8+0]
	st @R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2 @R[$i],$hi
	st $lo,[$rp+$i*8+0]
	st $hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov 1,%i0 ! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov 0,%i0 ! return failure
	ret
	restore

.align 32
.Lmsquare_$NUM:
	save %sp,-128,%sp; or $sentinel,%fp,%fp
	save %sp,-128,%sp; or $sentinel,%fp,%fp
	.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
	ba .Lmresume_$NUM
	nop
.type bn_mul_mont_t4_$NUM, #function
.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}
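# The loop above instantiates the single-op multiplication routines for
# 8, 16, 24 and 32 limbs, i.e. the 512-/1024-/1536-/2048-bit operand
# sizes advertised in the header comment.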
\f
########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl $pwr, 2, %o4
	and $pwr, 3, %o5
	and %o4, 7, %o4
	sll %o5, 3, %o5 ! offset within first cache line
	add %o5, $ptbl, $ptbl ! of the pwrtbl
	or %g0, 1, %o5
	sll %o5, %o4, $ccr
___
$code.=<<___ if (!$skip_wr);
	wr $ccr, %g0, %ccr
___
}
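#
# load_ccr turns a 5-bit window value $pwr into the %ccr mask that the
# conditional moves in load_b/load_b_pair key off. The decomposition,
# restated as an illustrative sketch of the same arithmetic:
#
#   # $pwr & 3        -> 64-bit word within a 32-byte cache line,
#   #                    hence the ($pwr & 3) * 8 offset added to $ptbl;
#   # ($pwr >> 2) & 7 -> which of the 8 interleaved lines, turned into
#   #                    a one-hot value 1 << index and written to %ccr
#   #                    (unless $skip_wr is set by the caller).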
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx [$pwrtbl+0*32], $B0
	ldx [$pwrtbl+8*32], $B1
	ldx [$pwrtbl+1*32], %o4
	ldx [$pwrtbl+9*32], %o5
	movvs %icc, %o4, $B0
	ldx [$pwrtbl+2*32], %o4
	movvs %icc, %o5, $B1
	ldx [$pwrtbl+10*32],%o5
	move %icc, %o4, $B0
	ldx [$pwrtbl+3*32], %o4
	move %icc, %o5, $B1
	ldx [$pwrtbl+11*32],%o5
	movneg %icc, %o4, $B0
	ldx [$pwrtbl+4*32], %o4
	movneg %icc, %o5, $B1
	ldx [$pwrtbl+12*32],%o5
	movcs %xcc, %o4, $B0
	ldx [$pwrtbl+5*32],%o4
	movcs %xcc, %o5, $B1
	ldx [$pwrtbl+13*32],%o5
	movvs %xcc, %o4, $B0
	ldx [$pwrtbl+6*32], %o4
	movvs %xcc, %o5, $B1
	ldx [$pwrtbl+14*32],%o5
	move %xcc, %o4, $B0
	ldx [$pwrtbl+7*32], %o4
	move %xcc, %o5, $B1
	ldx [$pwrtbl+15*32],%o5
	movneg %xcc, %o4, $B0
	add $pwrtbl,16*32, $pwrtbl
	movneg %xcc, %o5, $B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx [$pwrtbl+0*32], $Bi
	ldx [$pwrtbl+1*32], %o4
	ldx [$pwrtbl+2*32], %o5
	movvs %icc, %o4, $Bi
	ldx [$pwrtbl+3*32], %o4
	move %icc, %o5, $Bi
	ldx [$pwrtbl+4*32], %o5
	movneg %icc, %o4, $Bi
	ldx [$pwrtbl+5*32], %o4
	movcs %xcc, %o5, $Bi
	ldx [$pwrtbl+6*32], %o5
	movvs %xcc, %o4, $Bi
	ldx [$pwrtbl+7*32], %o4
	move %xcc, %o5, $Bi
	add $pwrtbl,8*32, $pwrtbl
	movneg %xcc, %o4, $Bi
___
}
\f
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
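# This is the "multi-op" routine from the header: for a $NUM-limb
# operand it performs five Montgomery squarings followed by one
# multiplication with a value gathered from pwrtbl, as used by a 5-bit
# fixed-window exponentiation step. pwr and stride arrive packed into a
# single global register and are unpacked on each stride iteration.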
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl bn_pwr5_mont_t4_$NUM
.align 32
bn_pwr5_mont_t4_$NUM:
#ifdef __arch64__
	mov 0,$sentinel
	mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
	mov -2047,%g4
	and %g1,SPARCV9_64BIT_STACK,%g1
	movrz %g1,0,%g4
	mov -1,$sentinel
	add %g4,-128,%g4
#else
	mov -1,$sentinel
	mov -128,%g4
#endif
	sllx $sentinel,32,$sentinel
	save %sp,%g4,%sp
#ifndef __arch64__
	save %sp,-128,%sp ! warm it up
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and %sp,1,%g4
	or $sentinel,%fp,%fp
	or %g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov %i0,$tp
	mov %i1,$np
	ld [%i2+0],%f1 ! load *n0
	ld [%i2+4],%f0
	mov %i3,$pwrtbl
	srl %i4,%g0,%i4 ! pack last arguments
	sllx %i5,32,$pwr
	or %i4,$pwr,$pwr
	fsrc2 %f0,%f60
___
\f
# load tp[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd [$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp

	srlx $pwr, 32, %o4 ! unpack $pwr
	srl $pwr, %g0, %o5
	sub %o4, 5, %o4
	mov $pwrtbl, %o7
	sllx %o4, 32, $pwr ! re-pack $pwr
	or %o5, $pwr, $pwr
	srl %o5, %o4, %o5
___
&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b .Lstride_$NUM
	nop
.align 16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax $pwr, 32, %o4 ! unpack $pwr
	srl $pwr, %g0, %o5
	sub %o4, 5, %o4
	mov $pwrtbl, %i7
	sllx %o4, 32, $pwr ! re-pack $pwr
	or %o5, $pwr, $pwr
	srl %o5, %o4, %o5
___
&load_ccr("%i7","%o5","%o4",1);
\f
# magic ################################################################
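# Five back-to-back montsqr operations followed by a single montmul;
# each one is guarded by the same %fcc3/sentinel abort checks as in the
# single-op routines, and the B operand for the montmul was gathered
# from the power table earlier in the stride loop.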
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
	fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr %o4, %g0, %ccr
	.word 0x81b02920+$NUM-1 ! montmul $NUM-1
	fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort_$NUM
#endif

	srax $pwr, 32, %o4
#ifdef __arch64__
	brgez %o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez %o4,.Lstride_$NUM
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort1_$NUM
	restore
#endif
___
\f
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and %fp,$sentinel,$sentinel
	restore
	and $sentinel,1,%o7
	and %fp,$sentinel,$sentinel
	srl %fp,0,%fp ! just in case?
	or %o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov 0,%i0 ! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std @R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov 1,%i0 ! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov 0,%i0 ! return failure
	ret
	restore
.type bn_pwr5_mont_t4_$NUM, #function
.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}
\f
{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
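#
# The loops below implement the usual word-serial Montgomery
# multiplication; as a reference, the per-iteration computation is
# (an illustrative pseudo-code sketch, not used by the generator):
#
#   # for (i = 0; i < num; i++) {
#   #     m1 = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64
#   #     tp = (tp + ap*bp[i] + np*m1) >> 64      # exact division
#   # }
#   # rp = (tp >= np) ? tp - np : tp              # constant-time copy
#
# with mulx/umulxhi producing the low/high halves of each 64x64-bit
# product and the carries kept in CCR.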
$code.=<<___;
.globl bn_mul_mont_t4
.align 32
bn_mul_mont_t4:
	add %sp, STACK_BIAS, %g4 ! real top of stack
	sll $num, 3, $num ! size in bytes
	add $num, 63, %g1
	andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
	sub %g4, %g1, %g1
	andn %g1, 63, %g1 ! align at 64 byte
	sub %g1, STACK_FRAME, %g1 ! new top of stack
	sub %g1, %g4, %g1

	save %sp, %g1, %sp
___
#	+-------------------------------+<----- %sp
#	.				.
#	+-------------------------------+<----- aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<----- aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld [$n0p+0], $t0 ! pull n0[0..1] value
	ld [$n0p+4], $t1
	add %sp, STACK_BIAS+STACK_FRAME, $tp
	ldx [$bp+0], $m0 ! m0=bp[0]
	sllx $t1, 32, $n0
	add $bp, 8, $bp
	or $t0, $n0, $n0
\f
	ldx [$ap+0], $aj ! ap[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
	umulxhi $aj, $m0, $hi0

	ldx [$ap+8], $aj ! ap[1]
	add $ap, 16, $ap
	ldx [$np+0], $nj ! np[0]

	mulx $lo0, $n0, $m1 ! "tp[0]"*n0

	mulx $aj, $m0, $alo ! ap[1]*bp[0]
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $lo1 ! np[0]*m1
	umulxhi $nj, $m1, $hi1

	ldx [$np+8], $nj ! np[1]

	addcc $lo0, $lo1, $lo1
	add $np, 16, $np
	addxc %g0, $hi1, $hi1

	mulx $nj, $m1, $nlo ! np[1]*m1
	umulxhi $nj, $m1, $nj ! nhi=nj
\f
	ba .L1st
	sub $num, 24, $cnt ! cnt=num-3

.align 16
.L1st:
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0

	ldx [$ap+0], $aj ! ap[j]
	addcc $nlo, $hi1, $lo1
	add $ap, 8, $ap
	addxc $nj, %g0, $hi1 ! nhi=nj

	ldx [$np+0], $nj ! np[j]
	mulx $aj, $m0, $alo ! ap[j]*bp[0]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $nlo ! np[j]*m1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	umulxhi $nj, $m1, $nj ! nhi=nj
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp ! tp++

	brnz,pt $cnt, .L1st
	sub $cnt, 8, $cnt ! j--
!.L1st
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp

	addcc $hi0, $hi1, $hi1
	addxc %g0, %g0, $ovf ! upmost overflow bit
	stxa $hi1, [$tp]0xe2
	add $tp, 8, $tp
\f
	ba .Louter
	sub $num, 16, $i ! i=num-2

.align 16
.Louter:
	ldx [$bp+0], $m0 ! m0=bp[i]
	add $bp, 8, $bp

	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp

	ldx [$ap+0], $aj ! ap[0]
	ldx [$np+0], $nj ! np[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
	ldx [$tp], $tj ! tp[0]
	umulxhi $aj, $m0, $hi0
	ldx [$ap+8], $aj ! ap[1]
	addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
	mulx $aj, $m0, $alo ! ap[1]*bp[i]
	addxc %g0, $hi0, $hi0
	mulx $lo0, $n0, $m1 ! tp[0]*n0
	umulxhi $aj, $m0, $aj ! ahi=aj
	mulx $nj, $m1, $lo1 ! np[0]*m1
	add $ap, 16, $ap
	umulxhi $nj, $m1, $hi1
	ldx [$np+8], $nj ! np[1]
	add $np, 16, $np
	addcc $lo1, $lo0, $lo1
	mulx $nj, $m1, $nlo ! np[1]*m1
	addxc %g0, $hi1, $hi1
	umulxhi $nj, $m1, $nj ! nhi=nj
\f
	ba .Linner
	sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
	addcc $alo, $hi0, $lo0
	ldx [$tp+8], $tj ! tp[j]
	addxc $aj, %g0, $hi0 ! ahi=aj
	ldx [$ap+0], $aj ! ap[j]
	add $ap, 8, $ap
	addcc $nlo, $hi1, $lo1
	mulx $aj, $m0, $alo ! ap[j]*bp[i]
	addxc $nj, %g0, $hi1 ! nhi=nj
	ldx [$np+0], $nj ! np[j]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	mulx $nj, $m1, $nlo ! np[j]*m1
	addxc %g0, $hi0, $hi0
	umulxhi $nj, $m1, $nj ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]
	add $tp, 8, $tp
	brnz,pt $cnt, .Linner
	sub $cnt, 8, $cnt
!.Linner
	ldx [$tp+8], $tj ! tp[j]
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	addxc %g0, $hi0, $hi0

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1 ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]

	subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
	addxccc $hi1, $hi0, $hi1
	addxc %g0, %g0, $ovf
	stx $hi1, [$tp+8]
	add $tp, 16, $tp

	brnz,pt $i, .Louter
	sub $i, 8, $i
\f
	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp
	ba .Lsub
	subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc

.align 16
.Lsub:
	ldx [$tp], $tj
	add $tp, 8, $tp
	ldx [$np+0], $nj
	add $np, 8, $np
	subccc $tj, $nj, $t2 ! tp[j]-np[j]
	srlx $tj, 32, $tj
	srlx $nj, 32, $nj
	subccc $tj, $nj, $t3
	add $rp, 8, $rp
	st $t2, [$rp-4] ! reverse order
	st $t3, [$rp-8]
	brnz,pt $cnt, .Lsub
	sub $cnt, 8, $cnt

	sub $np, $num, $np ! rewind
	sub $tp, $num, $tp
	sub $rp, $num, $rp

	subccc $ovf, %g0, $ovf ! handle upmost overflow bit
	ba .Lcopy
	sub $num, 8, $cnt

.align 16
.Lcopy: ! conditional copy
	ldx [$tp], $tj
	ldx [$rp+0], $t2
	stx %g0, [$tp] ! zap
	add $tp, 8, $tp
	movcs %icc, $tj, $t2
	stx $t2, [$rp+0]
	add $rp, 8, $rp
	brnz $cnt, .Lcopy
	sub $cnt, 8, $cnt

	mov 1, %o0
	ret
	restore
.type bn_mul_mont_t4, #function
.size bn_mul_mont_t4, .-bn_mul_mont_t4
___
\f
# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
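#
# Same algorithm as bn_mul_mont_t4 above, except that the b operand is
# not read linearly: each bp[i] is gathered from the power table with
# the CCR-driven conditional moves in load_b, and the window index
# ("power") arrives as the 7th argument on the stack.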
$code.=<<___;
.globl bn_mul_mont_gather5_t4
.align 32
bn_mul_mont_gather5_t4:
	add %sp, STACK_BIAS, %g4 ! real top of stack
	sll $num, 3, $num ! size in bytes
	add $num, 63, %g1
	andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
	sub %g4, %g1, %g1
	andn %g1, 63, %g1 ! align at 64 byte
	sub %g1, STACK_FRAME, %g1 ! new top of stack
	sub %g1, %g4, %g1
	LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument

	save %sp, %g1, %sp
___
#	+-------------------------------+<----- %sp
#	.				.
#	+-------------------------------+<----- aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<----- aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
&load_ccr($bp,"%g4",$ccr);
&load_b($bp,$m0,"%o7"); # m0=bp[0]

$code.=<<___;
	ld [$n0p+0], $t0 ! pull n0[0..1] value
	ld [$n0p+4], $t1
	add %sp, STACK_BIAS+STACK_FRAME, $tp
	sllx $t1, 32, $n0
	or $t0, $n0, $n0
\f
	ldx [$ap+0], $aj ! ap[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
	umulxhi $aj, $m0, $hi0

	ldx [$ap+8], $aj ! ap[1]
	add $ap, 16, $ap
	ldx [$np+0], $nj ! np[0]

	mulx $lo0, $n0, $m1 ! "tp[0]"*n0

	mulx $aj, $m0, $alo ! ap[1]*bp[0]
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $lo1 ! np[0]*m1
	umulxhi $nj, $m1, $hi1

	ldx [$np+8], $nj ! np[1]

	addcc $lo0, $lo1, $lo1
	add $np, 16, $np
	addxc %g0, $hi1, $hi1

	mulx $nj, $m1, $nlo ! np[1]*m1
	umulxhi $nj, $m1, $nj ! nhi=nj
\f
	ba .L1st_g5
	sub $num, 24, $cnt ! cnt=num-3

.align 16
.L1st_g5:
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0

	ldx [$ap+0], $aj ! ap[j]
	addcc $nlo, $hi1, $lo1
	add $ap, 8, $ap
	addxc $nj, %g0, $hi1 ! nhi=nj

	ldx [$np+0], $nj ! np[j]
	mulx $aj, $m0, $alo ! ap[j]*bp[0]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $nlo ! np[j]*m1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	umulxhi $nj, $m1, $nj ! nhi=nj
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp ! tp++

	brnz,pt $cnt, .L1st_g5
	sub $cnt, 8, $cnt ! j--
!.L1st_g5
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp

	addcc $hi0, $hi1, $hi1
	addxc %g0, %g0, $ovf ! upmost overflow bit
	stxa $hi1, [$tp]0xe2
	add $tp, 8, $tp
\f
	ba .Louter_g5
	sub $num, 16, $i ! i=num-2

.align 16
.Louter_g5:
	wr $ccr, %g0, %ccr
___
&load_b($bp,$m0); # m0=bp[i]
$code.=<<___;
	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp

	ldx [$ap+0], $aj ! ap[0]
	ldx [$np+0], $nj ! np[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
	ldx [$tp], $tj ! tp[0]
	umulxhi $aj, $m0, $hi0
	ldx [$ap+8], $aj ! ap[1]
	addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
	mulx $aj, $m0, $alo ! ap[1]*bp[i]
	addxc %g0, $hi0, $hi0
	mulx $lo0, $n0, $m1 ! tp[0]*n0
	umulxhi $aj, $m0, $aj ! ahi=aj
	mulx $nj, $m1, $lo1 ! np[0]*m1
	add $ap, 16, $ap
	umulxhi $nj, $m1, $hi1
	ldx [$np+8], $nj ! np[1]
	add $np, 16, $np
	addcc $lo1, $lo0, $lo1
	mulx $nj, $m1, $nlo ! np[1]*m1
	addxc %g0, $hi1, $hi1
	umulxhi $nj, $m1, $nj ! nhi=nj
\f
	ba .Linner_g5
	sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner_g5:
	addcc $alo, $hi0, $lo0
	ldx [$tp+8], $tj ! tp[j]
	addxc $aj, %g0, $hi0 ! ahi=aj
	ldx [$ap+0], $aj ! ap[j]
	add $ap, 8, $ap
	addcc $nlo, $hi1, $lo1
	mulx $aj, $m0, $alo ! ap[j]*bp[i]
	addxc $nj, %g0, $hi1 ! nhi=nj
	ldx [$np+0], $nj ! np[j]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	mulx $nj, $m1, $nlo ! np[j]*m1
	addxc %g0, $hi0, $hi0
	umulxhi $nj, $m1, $nj ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]
	add $tp, 8, $tp
	brnz,pt $cnt, .Linner_g5
	sub $cnt, 8, $cnt
!.Linner_g5
	ldx [$tp+8], $tj ! tp[j]
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	addxc %g0, $hi0, $hi0

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1 ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]

	subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
	addxccc $hi1, $hi0, $hi1
	addxc %g0, %g0, $ovf
	stx $hi1, [$tp+8]
	add $tp, 16, $tp

	brnz,pt $i, .Louter_g5
	sub $i, 8, $i
\f
	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp
	ba .Lsub_g5
	subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc

.align 16
.Lsub_g5:
	ldx [$tp], $tj
	add $tp, 8, $tp
	ldx [$np+0], $nj
	add $np, 8, $np
	subccc $tj, $nj, $t2 ! tp[j]-np[j]
	srlx $tj, 32, $tj
	srlx $nj, 32, $nj
	subccc $tj, $nj, $t3
	add $rp, 8, $rp
	st $t2, [$rp-4] ! reverse order
	st $t3, [$rp-8]
	brnz,pt $cnt, .Lsub_g5
	sub $cnt, 8, $cnt

	sub $np, $num, $np ! rewind
	sub $tp, $num, $tp
	sub $rp, $num, $rp

	subccc $ovf, %g0, $ovf ! handle upmost overflow bit
	ba .Lcopy_g5
	sub $num, 8, $cnt

.align 16
.Lcopy_g5: ! conditional copy
	ldx [$tp], $tj
	ldx [$rp+0], $t2
	stx %g0, [$tp] ! zap
	add $tp, 8, $tp
	movcs %icc, $tj, $t2
	stx $t2, [$rp+0]
	add $rp, 8, $rp
	brnz $cnt, .Lcopy_g5
	sub $cnt, 8, $cnt

	mov 1, %o0
	ret
	restore
.type bn_mul_mont_gather5_t4, #function
.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}
\f
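########################################################################
# Helper routines:
# bn_flip_t4 swaps the 32-bit halves of each 64-bit limb while copying,
# bn_flip_n_scatter5_t4 does the same flip and scatters the limbs into
# pwrtbl with a 32-entry stride (one table "column" per power), and
# bn_gather5_t4 reads a column back in constant time via the
# load_ccr/load_b conditional-move selection above.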
$code.=<<___;
.globl bn_flip_t4
.align 32
bn_flip_t4:
.Loop_flip:
	ld [%o1+0], %o4
	sub %o2, 1, %o2
	ld [%o1+4], %o5
	add %o1, 8, %o1
	st %o5, [%o0+0]
	st %o4, [%o0+4]
	brnz %o2, .Loop_flip
	add %o0, 8, %o0
	retl
	nop
.type bn_flip_t4, #function
.size bn_flip_t4, .-bn_flip_t4

.globl bn_flip_n_scatter5_t4
.align 32
bn_flip_n_scatter5_t4:
	sll %o3, 3, %o3
	srl %o1, 1, %o1
	add %o3, %o2, %o2 ! &pwrtbl[pwr]
	sub %o1, 1, %o1
.Loop_flip_n_scatter5:
	ld [%o0+0], %o4 ! inp[i]
	ld [%o0+4], %o5
	add %o0, 8, %o0
	sllx %o5, 32, %o5
	or %o4, %o5, %o5
	stx %o5, [%o2]
	add %o2, 32*8, %o2
	brnz %o1, .Loop_flip_n_scatter5
	sub %o1, 1, %o1
	retl
	nop
.type bn_flip_n_scatter5_t4, #function
.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl bn_gather5_t4
.align 32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub %o1, 1, %o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx %g1, [%o0]
	add %o0, 8, %o0
	brnz %o1, .Loop_gather5
	sub %o1, 1, %o1

	retl
	nop
.type bn_gather5_t4, #function
.size bn_gather5_t4, .-bn_gather5_t4

.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";