#! /usr/bin/env perl
# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under the 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA by the single-op ones. A special note about the 4096-bit RSA
# verify results: those operands are too long for the dedicated
# hardware instructions and are handled by the VIS3 code instead,
# which is why no improvement is visible there. It's surely possible
# to improve it [by deploying the 'mpmul' instruction], maybe in
# the future...
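# [Note: with 64-bit limbs these operand sizes correspond to $NUM = 8,
# 16, 24 and 32, which is exactly the set the generator loops at the
# bottom of this file instantiate.]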
#
# Performance improvement.
#
# 64-bit process, VIS3:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000628s	0.000028s	1592.4	35434.4
# rsa 2048 bits	0.003282s	0.000106s	304.7	9438.3
# rsa 4096 bits	0.025866s	0.000340s	38.7	2940.9
# dsa 1024 bits	0.000301s	0.000332s	3323.7	3013.9
# dsa 2048 bits	0.001056s	0.001233s	946.9	810.8
#
# 64-bit process, this module:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000256s	0.000016s	3904.4	61411.9
# rsa 2048 bits	0.000946s	0.000029s	1056.8	34292.7
# rsa 4096 bits	0.005061s	0.000340s	197.6	2940.5
# dsa 1024 bits	0.000176s	0.000195s	5674.7	5130.5
# dsa 2048 bits	0.000296s	0.000354s	3383.2	2827.6
#
######################################################################
# 32-bit process, VIS3:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000665s	0.000028s	1504.8	35233.3
# rsa 2048 bits	0.003349s	0.000106s	298.6	9433.4
# rsa 4096 bits	0.025959s	0.000341s	38.5	2934.8
# dsa 1024 bits	0.000320s	0.000341s	3123.3	2929.6
# dsa 2048 bits	0.001101s	0.001260s	908.2	793.4
#
# 32-bit process, this module:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000301s	0.000017s	3317.1	60240.0
# rsa 2048 bits	0.001034s	0.000030s	966.9	33812.7
# rsa 4096 bits	0.005244s	0.000341s	190.7	2935.4
# dsa 1024 bits	0.000201s	0.000205s	4976.1	4879.2
# dsa 2048 bits	0.000328s	0.000360s	3051.1	2774.2
#
# 32-bit code is prone to performance degradation as the rate of
# interrupts dispatched to the CPU executing the code grows. This is
# because when an interrupt is handled in a 32-bit process context,
# the upper halves of most integer registers used as input or output
# are zeroed. This renders the result invalid, and the operation has
# to be re-run. If the CPU is "bothered" with timer interrupts only,
# the penalty is hardly measurable. But to mitigate this problem at
# higher interrupt rates, contemporary Linux kernels recognize the
# biased stack even in a 32-bit process context and preserve full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.
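#
# [How the re-run is arranged, as can be read off the code below: in
# 32-bit builds $sentinel is set to -1<<32 and OR-ed into %fp of every
# register window, so an interrupt that zeroes the upper register
# halves also clears those bits; the "and %fp,$sentinel" checks on the
# way out detect this and make the subroutine return 0, leaving the
# caller to retry or fall back to the VIS3 path.]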

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop and open STDOUT,">$output";

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5)));	@N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7)));	@B=(@B,@B,map("%o$_",(0..3)));
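# [Correspondence, per the load/store loops below: @A holds ap[]
# (integer registers for the first 14 limbs, floating-point pairs for
# the rest), @N holds np[], @B holds bp[], and @R names the
# floating-point registers the result is moved into before being
# stored.]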
\f
########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___
\f
# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___
\f
# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
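# [The montmul/montsqr instructions are emitted as raw .word opcodes,
# presumably because not every assembler that has to process the
# generated code recognizes the mnemonics; the operand length is
# encoded in the low bits, hence the +$NUM-1.]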
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___
\f
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}
\f
########################################################################
#
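# [load_ccr/load_b/load_b_pair implement a fixed-access-pattern
# gather: $pwr selects one of 32 interleaved table entries, its low
# two bits giving the 8-byte offset within a 32-byte cache line and
# the next three bits a one-hot value written to %ccr. The conditional
# moves keyed on %icc/%xcc then pick the requested entry while every
# candidate location is loaded, so the memory access pattern does not
# depend on the power index.]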
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2,	%o4
	and	$pwr,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0,	%ccr
___
}
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc,	%o4,	$B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc,	%o5,	$B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc,	%o4,	$B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc,	%o4,	$B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc,	%o5,	$B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc,	%o4,	$B0
	add	$pwrtbl,16*32,	$pwrtbl
	movneg	%xcc,	%o5,	$B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc,	%o4,	$Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$Bi
	add	$pwrtbl,8*32,	$pwrtbl
	movneg	%xcc,	%o4,	$Bi
___
}
\f
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
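# [What one call computes, matching the "multi-op" description in the
# header: five Montgomery squarings of tp[] followed by one Montgomery
# multiplication by the gathered table entry pwrtbl[pwr], repeated
# while the stride counter packed into the upper half of $pwr remains
# non-negative.]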
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___
\f
# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ######################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%i7","%o5","%o4",1);
\f
# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___
\f
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}
\f
{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
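
# [The word-level Montgomery recurrence implemented below, with
# w = 2^64 and n0 = -np^-1 mod w: for each i,
#	m1 = (tp[0] + ap[0]*bp[i])*n0 mod w
#	tp = (tp + ap*bp[i] + np*m1)/w
# so after num iterations tp = ap*bp/w^num mod np, possibly plus one
# extra np that the final .Lsub/.Lcopy pass subtracts off.]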
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 bytes
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0
\f
	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
\f
	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp
\f
	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
\f
	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i
\f
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
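	! [the borrow propagated from .Lsub, combined with $ovf, decides
	! below whether tp or tp-np is copied to rp; using movcs instead
	! of a branch keeps the final reduction branch-free]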
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! conditional copy
	ldx	[$tp],	$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___
\f
# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 bytes
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");		# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0
\f
	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
\f
	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp
\f
	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);		# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
\f
	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i
\f
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! conditional copy
	ldx	[$tp],	$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}
\f
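# [Helper notes, inferred from the loops below: bn_flip_t4 swaps the
# 32-bit halves of each 64-bit word, converting between the BN_ULONG[]
# layout of 32-bit builds and the u64 vectors the T4 code works on;
# bn_flip_n_scatter5_t4 additionally scatters the flipped limbs into
# pwrtbl at a 32-entry stride, the layout load_b gathers from.]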
$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";