#! /usr/bin/env perl
# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is why
# no improvement is seen there. It could surely be improved [by
# deploying the 'mpmul' instruction], maybe in the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000628s	0.000028s	1592.4	35434.4
# rsa 2048 bits	0.003282s	0.000106s	 304.7	 9438.3
# rsa 4096 bits	0.025866s	0.000340s	  38.7	 2940.9
# dsa 1024 bits	0.000301s	0.000332s	3323.7	 3013.9
# dsa 2048 bits	0.001056s	0.001233s	 946.9	  810.8
#
# 64-bit process, this module:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000256s	0.000016s	3904.4	61411.9
# rsa 2048 bits	0.000946s	0.000029s	1056.8	34292.7
# rsa 4096 bits	0.005061s	0.000340s	 197.6	 2940.5
# dsa 1024 bits	0.000176s	0.000195s	5674.7	 5130.5
# dsa 2048 bits	0.000296s	0.000354s	3383.2	 2827.6
#
######################################################################
# 32-bit process, VIS3:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000665s	0.000028s	1504.8	35233.3
# rsa 2048 bits	0.003349s	0.000106s	 298.6	 9433.4
# rsa 4096 bits	0.025959s	0.000341s	  38.5	 2934.8
# dsa 1024 bits	0.000320s	0.000341s	3123.3	 2929.6
# dsa 2048 bits	0.001101s	0.001260s	 908.2	  793.4
#
# 32-bit process, this module:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000301s	0.000017s	3317.1	60240.0
# rsa 2048 bits	0.001034s	0.000030s	 966.9	33812.7
# rsa 4096 bits	0.005244s	0.000341s	 190.7	 2935.4
# dsa 1024 bits	0.000201s	0.000205s	4976.1	 4879.2
# dsa 2048 bits	0.000328s	0.000360s	3051.1	 2774.2
#
# 32-bit code is prone to performance degradation as the interrupt rate
# delivered to the CPU executing the code grows. This is because, in the
# standard handling of an interrupt in a 32-bit process context, the
# upper halves of most integer registers used as input or output are
# zeroed. This renders the result invalid, and the operation has to be
# re-run. If the CPU is "bothered" with timer interrupts only, the
# penalty is hardly measurable. But to mitigate this problem at higher
# interrupt rates, the contemporary Linux kernel recognizes a biased
# stack even in a 32-bit process context and preserves full register
# contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.

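# For orientation (a schematic sketch, not the exact register-level flow
# used below): "Montgomery multiplication" here means computing
# a*b*2^(-64*num) mod n for num-word operands, with n0 = -n^(-1) mod 2^64
# supplied by the caller. One word-serial pass looks roughly like
#
#	for (i = 0; i < num; i++) {
#		t   = acc + a*b[i];			# widening accumulation
#		m   = (t mod 2^64)*n0 mod 2^64;		# reduction multiplier
#		acc = (t + m*n) / 2^64;			# exact division, no remainder
#	}
#
# Both the T4 montmul/montsqr instructions and the VIS3 fall-back at the
# end of this file implement optimized variants of this loop.
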
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop and open STDOUT,">$output";

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
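# For illustration, the maps above expand to 32-entry lists, e.g.
#	@R = (%f0,%f2,...,%f22,%f60,%f62,%f24,%f26,...,%f58)
#	@N = (%l0..%l7,%o0..%o5) repeated to 32 entries
#	@A = @N[0..13] followed by @R[14..31]
#	@B = (%i0..%i5,%l0..%l7) repeated, padded with %o0..%o3
# so that operands of up to 32 64-bit words can be named uniformly below.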
\f
########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef __arch64__
	mov 0,$sentinel
	mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov -2047,%g4
	and %g1,SPARCV9_64BIT_STACK,%g1
	movrz %g1,0,%g4
	mov -1,$sentinel
	add %g4,-128,%g4
#else
	mov -1,$sentinel
	mov -128,%g4
#endif
	sllx $sentinel,32,$sentinel
	save %sp,%g4,%sp
#ifndef __arch64__
	save %sp,-128,%sp	! warm it up
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and %sp,1,%g4
	or $sentinel,%fp,%fp
	or %g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov %i0,$rp
	mov %i1,$ap
	mov %i2,$bp
	mov %i3,$np
	ld [%i4+0],%f1	! load *n0
	ld [%i4+4],%f0
	fsrc2 %f0,%f60
___
\f
# load ap[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld [$ap+$i*8+0],$lo
	ld [$ap+$i*8+4],@A[$i]
	sllx @A[$i],32,@A[$i]
	or $lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld [$ap+$i*8+0],$lo
	ld [$ap+$i*8+4],$hi
	fsrc2 $hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp $ap,$bp
	be SIZE_T_CC,.Lmsquare_$NUM
	nop
___
\f
# load bp[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld [$bp+$i*8+0],$lo
	ld [$bp+$i*8+4],@B[$i]
	sllx @B[$i],32,@B[$i]
	or $lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld [$bp+$i*8+0],$lo
	ld [$bp+$i*8+4],@B[$i]
	sllx @B[$i],32,@B[$i]
	or $lo,@B[$i],@B[$i]
___
}
# magic ################################################################
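# The montmul/montsqr instructions below are emitted as raw .word values
# (0x81b02920/0x81b02940 plus the operand-length field) rather than by
# mnemonic, presumably so that assemblers without T4 support can still
# build this module; the "! montmul $NUM-1" comments record the intent.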
$code.=<<___;
	.word 0x81b02920+$NUM-1	! montmul $NUM-1
.Lmresume_$NUM:
	fbu,pn %fcc3,.Lmabort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef __arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Lmabort1_$NUM
	restore
#endif
___
\f
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and %fp,$sentinel,$sentinel
	restore
	and $sentinel,1,%o7
	and %fp,$sentinel,$sentinel
	srl %fp,0,%fp		! just in case?
	or %o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov 0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st $lo,[$rp+$i*8+0]
	st @R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2 @R[$i],$hi
	st $lo,[$rp+$i*8+0]
	st $hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov 1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov 0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save %sp,-128,%sp; or $sentinel,%fp,%fp
	save %sp,-128,%sp; or $sentinel,%fp,%fp
	.word 0x81b02940+$NUM-1	! montsqr $NUM-1
	ba .Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}
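# The loop above emits bn_mul_mont_t4_8, _16, _24 and _32, i.e. the
# "single-op" entry points for 512-, 1024-, 1536- and 2048-bit operands
# (8, 16, 24 and 32 64-bit words) mentioned in the header comment.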
\f
########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl $pwr, 2, %o4
	and $pwr, 3, %o5
	and %o4, 7, %o4
	sll %o5, 3, %o5	! offset within first cache line
	add %o5, $ptbl, $ptbl	! of the pwrtbl
	or %g0, 1, %o5
	sll %o5, %o4, $ccr
___
$code.=<<___ if (!$skip_wr);
	wr $ccr, %g0, %ccr
___
}
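# In scalar terms load_ccr computes, assuming 0 <= $pwr < 32:
#	$ptbl += ($pwr & 3)*8;			# word slot within a 32-byte line
#	$ccr   = 1 << (($pwr >> 2) & 7);	# one-hot flag mask for %ccr
# Writing that mask to %ccr lets load_b/load_b_pair pick the wanted table
# entry with conditional moves rather than a power-dependent address, so
# the memory access pattern does not leak the power index.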
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx [$pwrtbl+0*32], $B0
	ldx [$pwrtbl+8*32], $B1
	ldx [$pwrtbl+1*32], %o4
	ldx [$pwrtbl+9*32], %o5
	movvs %icc, %o4, $B0
	ldx [$pwrtbl+2*32], %o4
	movvs %icc, %o5, $B1
	ldx [$pwrtbl+10*32],%o5
	move %icc, %o4, $B0
	ldx [$pwrtbl+3*32], %o4
	move %icc, %o5, $B1
	ldx [$pwrtbl+11*32],%o5
	movneg %icc, %o4, $B0
	ldx [$pwrtbl+4*32], %o4
	movneg %icc, %o5, $B1
	ldx [$pwrtbl+12*32],%o5
	movcs %xcc, %o4, $B0
	ldx [$pwrtbl+5*32],%o4
	movcs %xcc, %o5, $B1
	ldx [$pwrtbl+13*32],%o5
	movvs %xcc, %o4, $B0
	ldx [$pwrtbl+6*32], %o4
	movvs %xcc, %o5, $B1
	ldx [$pwrtbl+14*32],%o5
	move %xcc, %o4, $B0
	ldx [$pwrtbl+7*32], %o4
	move %xcc, %o5, $B1
	ldx [$pwrtbl+15*32],%o5
	movneg %xcc, %o4, $B0
	add $pwrtbl,16*32, $pwrtbl
	movneg %xcc, %o5, $B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx [$pwrtbl+0*32], $Bi
	ldx [$pwrtbl+1*32], %o4
	ldx [$pwrtbl+2*32], %o5
	movvs %icc, %o4, $Bi
	ldx [$pwrtbl+3*32], %o4
	move %icc, %o5, $Bi
	ldx [$pwrtbl+4*32], %o5
	movneg %icc, %o4, $Bi
	ldx [$pwrtbl+5*32], %o4
	movcs %xcc, %o5, $Bi
	ldx [$pwrtbl+6*32], %o5
	movvs %xcc, %o4, $Bi
	ldx [$pwrtbl+7*32], %o4
	move %xcc, %o5, $Bi
	add $pwrtbl,8*32, $pwrtbl
	movneg %xcc, %o4, $Bi
___
}
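# Net effect of load_b (and, pairwise, load_b_pair): all eight candidate
# words are always loaded, and the %icc/%xcc conditional moves keyed off
# the mask in %ccr keep exactly one of them, namely the word belonging to
# the power selected earlier via load_ccr; $pwrtbl then advances by 32
# entries (8*32 bytes) to the next word of the table.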
\f
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef __arch64__
	mov 0,$sentinel
	mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov -2047,%g4
	and %g1,SPARCV9_64BIT_STACK,%g1
	movrz %g1,0,%g4
	mov -1,$sentinel
	add %g4,-128,%g4
#else
	mov -1,$sentinel
	mov -128,%g4
#endif
	sllx $sentinel,32,$sentinel
	save %sp,%g4,%sp
#ifndef __arch64__
	save %sp,-128,%sp	! warm it up
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and %sp,1,%g4
	or $sentinel,%fp,%fp
	or %g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov %i0,$tp
	mov %i1,$np
	ld [%i2+0],%f1	! load *n0
	ld [%i2+4],%f0
	mov %i3,$pwrtbl
	srl %i4,%g0,%i4	! pack last arguments
	sllx %i5,32,$pwr
	or %i4,$pwr,$pwr
	fsrc2 %f0,%f60
___
\f
# load tp[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd [$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp

	srlx $pwr, 32, %o4	! unpack $pwr
	srl $pwr, %g0, %o5
	sub %o4, 5, %o4
	mov $pwrtbl, %o7
	sllx %o4, 32, $pwr	! re-pack $pwr
	or %o5, $pwr, $pwr
	srl %o5, %o4, %o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b .Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax $pwr, 32, %o4	! unpack $pwr
	srl $pwr, %g0, %o5
	sub %o4, 5, %o4
	mov $pwrtbl, %i7
	sllx %o4, 32, $pwr	! re-pack $pwr
	or %o5, $pwr, $pwr
	srl %o5, %o4, %o5
___
	&load_ccr("%i7","%o5","%o4",1);
\f
# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word 0x81b02940+$NUM-1	! montsqr $NUM-1
	fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr %o4, %g0, %ccr
	.word 0x81b02920+$NUM-1	! montmul $NUM-1
	fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort_$NUM
#endif

	srax $pwr, 32, %o4
#ifdef __arch64__
	brgez %o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez %o4,.Lstride_$NUM
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort1_$NUM
	restore
#endif
___
\f
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and %fp,$sentinel,$sentinel
	restore
	and $sentinel,1,%o7
	and %fp,$sentinel,$sentinel
	srl %fp,0,%fp		! just in case?
	or %o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov 0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std @R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov 1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov 0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}
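# Likewise, the loop above emits bn_pwr5_mont_t4_8..32, the "multi-op"
# entry points: each call performs five Montgomery squarings followed by
# one Montgomery multiplication by a power gathered from pwrtbl, i.e.
# roughly acc <- acc^32 * pwrtbl[window] per 5-bit exponent window, the
# pattern that dominates RSA sign.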
\f
{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add %sp, STACK_BIAS, %g4	! real top of stack
	sll $num, 3, $num		! size in bytes
	add $num, 63, %g1
	andn %g1, 63, %g1		! buffer size rounded up to 64 bytes
	sub %g4, %g1, %g1
	andn %g1, 63, %g1		! align at 64 byte
	sub %g1, STACK_FRAME, %g1	! new top of stack
	sub %g1, %g4, %g1

	save %sp, %g1, %sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld [$n0p+0], $t0	! pull n0[0..1] value
	ld [$n0p+4], $t1
	add %sp, STACK_BIAS+STACK_FRAME, $tp
	ldx [$bp+0], $m0	! m0=bp[0]
	sllx $t1, 32, $n0
	add $bp, 8, $bp
	or $t0, $n0, $n0
\f
	ldx [$ap+0], $aj	! ap[0]

	mulx $aj, $m0, $lo0	! ap[0]*bp[0]
	umulxhi $aj, $m0, $hi0

	ldx [$ap+8], $aj	! ap[1]
	add $ap, 16, $ap
	ldx [$np+0], $nj	! np[0]

	mulx $lo0, $n0, $m1	! "tp[0]"*n0

	mulx $aj, $m0, $alo	! ap[1]*bp[0]
	umulxhi $aj, $m0, $aj	! ahi=aj

	mulx $nj, $m1, $lo1	! np[0]*m1
	umulxhi $nj, $m1, $hi1

	ldx [$np+8], $nj	! np[1]

	addcc $lo0, $lo1, $lo1
	add $np, 16, $np
	addxc %g0, $hi1, $hi1

	mulx $nj, $m1, $nlo	! np[1]*m1
	umulxhi $nj, $m1, $nj	! nhi=nj
\f
	ba .L1st
	sub $num, 24, $cnt	! cnt=num-3

.align	16
.L1st:
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0

	ldx [$ap+0], $aj	! ap[j]
	addcc $nlo, $hi1, $lo1
	add $ap, 8, $ap
	addxc $nj, %g0, $hi1	! nhi=nj

	ldx [$np+0], $nj	! np[j]
	mulx $aj, $m0, $alo	! ap[j]*bp[0]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj	! ahi=aj

	mulx $nj, $m1, $nlo	! np[j]*m1
	addcc $lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi $nj, $m1, $nj	! nhi=nj
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2	! tp[j-1]
	add $tp, 8, $tp		! tp++

	brnz,pt $cnt, .L1st
	sub $cnt, 8, $cnt	! j--
!.L1st
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0	! ahi=aj

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1
	addcc $lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2	! tp[j-1]
	add $tp, 8, $tp

	addcc $hi0, $hi1, $hi1
	addxc %g0, %g0, $ovf	! upmost overflow bit
	stxa $hi1, [$tp]0xe2
	add $tp, 8, $tp
\f
	ba .Louter
	sub $num, 16, $i	! i=num-2

.align	16
.Louter:
	ldx [$bp+0], $m0	! m0=bp[i]
	add $bp, 8, $bp

	sub $ap, $num, $ap	! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp

	ldx [$ap+0], $aj	! ap[0]
	ldx [$np+0], $nj	! np[0]

	mulx $aj, $m0, $lo0	! ap[0]*bp[i]
	ldx [$tp], $tj		! tp[0]
	umulxhi $aj, $m0, $hi0
	ldx [$ap+8], $aj	! ap[1]
	addcc $lo0, $tj, $lo0	! ap[0]*bp[i]+tp[0]
	mulx $aj, $m0, $alo	! ap[1]*bp[i]
	addxc %g0, $hi0, $hi0
	mulx $lo0, $n0, $m1	! tp[0]*n0
	umulxhi $aj, $m0, $aj	! ahi=aj
	mulx $nj, $m1, $lo1	! np[0]*m1
	add $ap, 16, $ap
	umulxhi $nj, $m1, $hi1
	ldx [$np+8], $nj	! np[1]
	add $np, 16, $np
	addcc $lo1, $lo0, $lo1
	mulx $nj, $m1, $nlo	! np[1]*m1
	addxc %g0, $hi1, $hi1
	umulxhi $nj, $m1, $nj	! nhi=nj
\f
	ba .Linner
	sub $num, 24, $cnt	! cnt=num-3
.align	16
.Linner:
	addcc $alo, $hi0, $lo0
	ldx [$tp+8], $tj	! tp[j]
	addxc $aj, %g0, $hi0	! ahi=aj
	ldx [$ap+0], $aj	! ap[j]
	add $ap, 8, $ap
	addcc $nlo, $hi1, $lo1
	mulx $aj, $m0, $alo	! ap[j]*bp[i]
	addxc $nj, %g0, $hi1	! nhi=nj
	ldx [$np+0], $nj	! np[j]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj	! ahi=aj
	addcc $lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	mulx $nj, $m1, $nlo	! np[j]*m1
	addxc %g0, $hi0, $hi0
	umulxhi $nj, $m1, $nj	! nhi=nj
	addcc $lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp]		! tp[j-1]
	add $tp, 8, $tp
	brnz,pt $cnt, .Linner
	sub $cnt, 8, $cnt
!.Linner
	ldx [$tp+8], $tj	! tp[j]
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0	! ahi=aj
	addcc $lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	addxc %g0, $hi0, $hi0

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1	! nhi=nj
	addcc $lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp]		! tp[j-1]

	subcc %g0, $ovf, %g0	! move upmost overflow to CCR.xcc
	addxccc $hi1, $hi0, $hi1
	addxc %g0, %g0, $ovf
	stx $hi1, [$tp+8]
	add $tp, 16, $tp

	brnz,pt $i, .Louter
	sub $i, 8, $i
\f
	sub $ap, $num, $ap	! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp
	ba .Lsub
	subcc $num, 8, $cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx [$tp], $tj
	add $tp, 8, $tp
	ldx [$np+0], $nj
	add $np, 8, $np
	subccc $tj, $nj, $t2	! tp[j]-np[j]
	srlx $tj, 32, $tj
	srlx $nj, 32, $nj
	subccc $tj, $nj, $t3
	add $rp, 8, $rp
	st $t2, [$rp-4]		! reverse order
	st $t3, [$rp-8]
	brnz,pt $cnt, .Lsub
	sub $cnt, 8, $cnt

	sub $np, $num, $np	! rewind
	sub $tp, $num, $tp
	sub $rp, $num, $rp

	subccc $ovf, %g0, $ovf	! handle upmost overflow bit
	ba .Lcopy
	sub $num, 8, $cnt

.align	16
.Lcopy:					! conditional copy
	ldx [$tp], $tj
	ldx [$rp+0], $t2
	stx %g0, [$tp]		! zap
	add $tp, 8, $tp
	movcs %icc, $tj, $t2
	stx $t2, [$rp+0]
	add $rp, 8, $rp
	brnz $cnt, .Lcopy
	sub $cnt, 8, $cnt

	mov 1, %o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___
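# bn_mul_mont_t4 above is the integer/VIS3 fall-back used when the
# operands are too long for the montmul/montsqr hardware. In effect each
# outer-loop pass computes
#	m1 = (tp[0] + ap[0]*bp[i])*n0 mod 2^64
#	tp = (tp + ap*bp[i] + m1*np) / 2^64
# and the .Lsub/.Lcopy tail performs the final conditional subtraction
# of np.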
\f
# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add %sp, STACK_BIAS, %g4	! real top of stack
	sll $num, 3, $num		! size in bytes
	add $num, 63, %g1
	andn %g1, 63, %g1		! buffer size rounded up to 64 bytes
	sub %g4, %g1, %g1
	andn %g1, 63, %g1		! align at 64 byte
	sub %g1, STACK_FRAME, %g1	! new top of stack
	sub %g1, %g4, %g1
	LDPTR [%sp+STACK_7thARG], %g4	! load power, 7th argument

	save %sp, %g1, %sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");	# m0=bp[0]

$code.=<<___;
	ld [$n0p+0], $t0	! pull n0[0..1] value
	ld [$n0p+4], $t1
	add %sp, STACK_BIAS+STACK_FRAME, $tp
	sllx $t1, 32, $n0
	or $t0, $n0, $n0
\f
	ldx [$ap+0], $aj	! ap[0]

	mulx $aj, $m0, $lo0	! ap[0]*bp[0]
	umulxhi $aj, $m0, $hi0

	ldx [$ap+8], $aj	! ap[1]
	add $ap, 16, $ap
	ldx [$np+0], $nj	! np[0]

	mulx $lo0, $n0, $m1	! "tp[0]"*n0

	mulx $aj, $m0, $alo	! ap[1]*bp[0]
	umulxhi $aj, $m0, $aj	! ahi=aj

	mulx $nj, $m1, $lo1	! np[0]*m1
	umulxhi $nj, $m1, $hi1

	ldx [$np+8], $nj	! np[1]

	addcc $lo0, $lo1, $lo1
	add $np, 16, $np
	addxc %g0, $hi1, $hi1

	mulx $nj, $m1, $nlo	! np[1]*m1
	umulxhi $nj, $m1, $nj	! nhi=nj
\f
	ba .L1st_g5
	sub $num, 24, $cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0

	ldx [$ap+0], $aj	! ap[j]
	addcc $nlo, $hi1, $lo1
	add $ap, 8, $ap
	addxc $nj, %g0, $hi1	! nhi=nj

	ldx [$np+0], $nj	! np[j]
	mulx $aj, $m0, $alo	! ap[j]*bp[0]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj	! ahi=aj

	mulx $nj, $m1, $nlo	! np[j]*m1
	addcc $lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi $nj, $m1, $nj	! nhi=nj
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2	! tp[j-1]
	add $tp, 8, $tp		! tp++

	brnz,pt $cnt, .L1st_g5
	sub $cnt, 8, $cnt	! j--
!.L1st_g5
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0	! ahi=aj

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1
	addcc $lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2	! tp[j-1]
	add $tp, 8, $tp

	addcc $hi0, $hi1, $hi1
	addxc %g0, %g0, $ovf	! upmost overflow bit
	stxa $hi1, [$tp]0xe2
	add $tp, 8, $tp
\f
	ba .Louter_g5
	sub $num, 16, $i	! i=num-2

.align	16
.Louter_g5:
	wr $ccr, %g0, %ccr
___
	&load_b($bp,$m0);	# m0=bp[i]
$code.=<<___;
	sub $ap, $num, $ap	! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp

	ldx [$ap+0], $aj	! ap[0]
	ldx [$np+0], $nj	! np[0]

	mulx $aj, $m0, $lo0	! ap[0]*bp[i]
	ldx [$tp], $tj		! tp[0]
	umulxhi $aj, $m0, $hi0
	ldx [$ap+8], $aj	! ap[1]
	addcc $lo0, $tj, $lo0	! ap[0]*bp[i]+tp[0]
	mulx $aj, $m0, $alo	! ap[1]*bp[i]
	addxc %g0, $hi0, $hi0
	mulx $lo0, $n0, $m1	! tp[0]*n0
	umulxhi $aj, $m0, $aj	! ahi=aj
	mulx $nj, $m1, $lo1	! np[0]*m1
	add $ap, 16, $ap
	umulxhi $nj, $m1, $hi1
	ldx [$np+8], $nj	! np[1]
	add $np, 16, $np
	addcc $lo1, $lo0, $lo1
	mulx $nj, $m1, $nlo	! np[1]*m1
	addxc %g0, $hi1, $hi1
	umulxhi $nj, $m1, $nj	! nhi=nj
\f
	ba .Linner_g5
	sub $num, 24, $cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc $alo, $hi0, $lo0
	ldx [$tp+8], $tj	! tp[j]
	addxc $aj, %g0, $hi0	! ahi=aj
	ldx [$ap+0], $aj	! ap[j]
	add $ap, 8, $ap
	addcc $nlo, $hi1, $lo1
	mulx $aj, $m0, $alo	! ap[j]*bp[i]
	addxc $nj, %g0, $hi1	! nhi=nj
	ldx [$np+0], $nj	! np[j]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj	! ahi=aj
	addcc $lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	mulx $nj, $m1, $nlo	! np[j]*m1
	addxc %g0, $hi0, $hi0
	umulxhi $nj, $m1, $nj	! nhi=nj
	addcc $lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp]		! tp[j-1]
	add $tp, 8, $tp
	brnz,pt $cnt, .Linner_g5
	sub $cnt, 8, $cnt
!.Linner_g5
	ldx [$tp+8], $tj	! tp[j]
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0	! ahi=aj
	addcc $lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	addxc %g0, $hi0, $hi0

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1	! nhi=nj
	addcc $lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp]		! tp[j-1]

	subcc %g0, $ovf, %g0	! move upmost overflow to CCR.xcc
	addxccc $hi1, $hi0, $hi1
	addxc %g0, %g0, $ovf
	stx $hi1, [$tp+8]
	add $tp, 16, $tp

	brnz,pt $i, .Louter_g5
	sub $i, 8, $i
\f
	sub $ap, $num, $ap	! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp
	ba .Lsub_g5
	subcc $num, 8, $cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx [$tp], $tj
	add $tp, 8, $tp
	ldx [$np+0], $nj
	add $np, 8, $np
	subccc $tj, $nj, $t2	! tp[j]-np[j]
	srlx $tj, 32, $tj
	srlx $nj, 32, $nj
	subccc $tj, $nj, $t3
	add $rp, 8, $rp
	st $t2, [$rp-4]		! reverse order
	st $t3, [$rp-8]
	brnz,pt $cnt, .Lsub_g5
	sub $cnt, 8, $cnt

	sub $np, $num, $np	! rewind
	sub $tp, $num, $tp
	sub $rp, $num, $rp

	subccc $ovf, %g0, $ovf	! handle upmost overflow bit
	ba .Lcopy_g5
	sub $num, 8, $cnt

.align	16
.Lcopy_g5:				! conditional copy
	ldx [$tp], $tj
	ldx [$rp+0], $t2
	stx %g0, [$tp]		! zap
	add $tp, 8, $tp
	movcs %icc, $tj, $t2
	stx $t2, [$rp+0]
	add $rp, 8, $rp
	brnz $cnt, .Lcopy_g5
	sub $cnt, 8, $cnt

	mov 1, %o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}
\f
$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld [%o1+0], %o4
	sub %o2, 1, %o2
	ld [%o1+4], %o5
	add %o1, 8, %o1
	st %o5, [%o0+0]
	st %o4, [%o0+4]
	brnz %o2, .Loop_flip
	add %o0, 8, %o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4
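! bn_flip_t4 above copies %o2 64-bit words from %o1 to %o0 while swapping
! the two 32-bit halves of each word, presumably to convert between the
! caller's BN_ULONG layout and the 64-bit vectors this module works on.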

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll %o3, 3, %o3
	srl %o1, 1, %o1
	add %o3, %o2, %o2	! &pwrtbl[pwr]
	sub %o1, 1, %o1
.Loop_flip_n_scatter5:
	ld [%o0+0], %o4	! inp[i]
	ld [%o0+4], %o5
	add %o0, 8, %o0
	sllx %o5, 32, %o5
	or %o4, %o5, %o5
	stx %o5, [%o2]
	add %o2, 32*8, %o2
	brnz %o1, .Loop_flip_n_scatter5
	sub %o1, 1, %o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
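! bn_flip_n_scatter5_t4 stores its (half-swapped, as above) input into
! column %o3 of the power table at %o2, one 64-bit word every 32 table
! entries; this strided layout is what load_ccr/load_b and bn_gather5_t4
! below read back.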

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub %o1, 1, %o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx %g1, [%o0]
	add %o0, 8, %o0
	brnz %o1, .Loop_gather5
	sub %o1, 1, %o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";