]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
367ace68 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
09854736 AP |
9 | # |
10 | # ==================================================================== | |
d4665887 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
09854736 AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # This module doesn't present direct interest for OpenSSL, because it | |
d4665887 AP |
18 | # doesn't provide better performance for longer keys, at least not on |
19 | # in-order-execution cores. While 512-bit RSA sign operations can be | |
20 | # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and | |
21 | # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from | |
22 | # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA | |
23 | # verify:-( All comparisons are against bn_mul_mont-free assembler. | |
24 | # The module might be of interest to embedded system developers, as | |
25 | # the code is smaller than 1KB, yet offers >3x improvement on MIPS64 | |
26 | # and 75-30% [less for longer keys] on MIPS32 over compiler-generated | |
27 | # code. | |
09854736 AP |
28 | |
29 | ###################################################################### | |
30 | # There are a number of MIPS ABIs in use; O32 and N32/64 are the most | |
31 | # widely used. Then there is a new contender: NUBI. It appears that if | |
32 | # one picks the latter, it's possible to arrange code in ABI neutral | |
33 | # manner. Therefore let's stick to NUBI register layout: | |
34 | # | |
35 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | |
36 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | |
37 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | |
38 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | |
39 | # | |
40 | # The return value is placed in $a0. The following coding rules facilitate | |
41 | # interoperability: | |
42 | # | |
43 | # - never ever touch $tp, "thread pointer", former $gp; | |
44 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | |
45 | # old code]; | |
46 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | |
47 | # | |
48 | # For reference here is register layout for N32/64 MIPS ABIs: | |
49 | # | |
50 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | |
51 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | |
52 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | |
53 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | |
54 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | |
1aa89a7a RL |
55 | |
56 | # $output is the last argument if it looks like a file (it has an extension) | |
57 | # $flavour is the first argument if it doesn't look like a file | |
58 | $output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop : undef; | |
59 | # recognised flavours: o32, n32, 64, nubi32, nubi64; falls back to o32 | |
60 | $flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift : "o32"; | |
09854736 AP |
61 | |
62 | if ($flavour =~ /64|n32/i) { # n32, 64 and nubi64: doubleword pointer math and register save/restore | |
947716c1 AP |
63 | $PTR_ADD="daddu"; # incidentally works even on n32 |
64 | $PTR_SUB="dsubu"; # incidentally works even on n32 | |
09854736 AP |
65 | $REG_S="sd"; |
66 | $REG_L="ld"; | |
67 | $SZREG=8; # bytes per saved-register slot in the stack frame | |
68 | } else { # every other flavour (e.g. o32, nubi32): word-sized equivalents | |
947716c1 AP |
69 | $PTR_ADD="addu"; |
70 | $PTR_SUB="subu"; | |
09854736 AP |
71 | $REG_S="sw"; |
72 | $REG_L="lw"; | |
73 | $SZREG=4; | |
74 | } | |
75 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; # fed to .mask below; bits 16-23 always, plus bits 12-15 on nubi, matching the extra $s0-$s3 saves | |
76 | # | |
77 | # <appro@openssl.org> | |
78 | # | |
79 | ###################################################################### | |
80 | ||
1aa89a7a | 81 | open(STDOUT, '>', $output) or die "can't open $output: $!" if $output; # redirect the generated assembly to $output when one was given; abort with the real reason if it cannot be opened |
09854736 AP |
82 | |
83 | if ($flavour =~ /64|n32/i) { # 8-byte BN words: doubleword load/store/multiply/add/subtract | |
84 | $LD="ld"; | |
85 | $ST="sd"; | |
86 | $MULTU="dmultu"; | |
87 | $ADDU="daddu"; | |
88 | $SUBU="dsubu"; | |
89 | $BNSZ=8; # bytes per BN word; also the stride used to walk ap/bp/np/tp | |
90 | } else { # every other flavour: 4-byte BN words | |
91 | $LD="lw"; | |
92 | $ST="sw"; | |
93 | $MULTU="multu"; | |
94 | $ADDU="addu"; | |
95 | $SUBU="subu"; | |
96 | $BNSZ=4; | |
97 | } | |
98 | ||
99 | # int bn_mul_mont( | |
100 | $rp=$a0; # BN_ULONG *rp, | |
101 | $ap=$a1; # const BN_ULONG *ap, | |
102 | $bp=$a2; # const BN_ULONG *bp, | |
103 | $np=$a3; # const BN_ULONG *np, | |
104 | $n0=$a4; # const BN_ULONG *n0, | |
105 | $num=$a5; # int num); | |
106 | ||
107 | $lo0=$a6; # low word of the running ap[]*bi sum | |
108 | $hi0=$a7; # high/carry word of the running ap[]*bi sum | |
109 | $lo1=$t1; # low word of the running np[]*m1 sum | |
110 | $hi1=$t2; # high/carry word of the running np[]*m1 sum | |
111 | $aj=$s0; # current word loaded from ap | |
112 | $bi=$s1; # current word loaded from bp | |
113 | $nj=$s2; # current word loaded from np | |
114 | $tp=$s3; # cursor into the temporary vector on the stack; NOTE: re-binds $tp, which the register layout above set to the thread-pointer register | |
115 | $alo=$s4; # low half of the next ap[]*bi product | |
116 | $ahi=$s5; # high half of the next ap[]*bi product (briefly holds ap[1] before the multiply) | |
117 | $nlo=$s6; # low half of the next np[]*m1 product | |
118 | $nhi=$s7; # high half of the next np[]*m1 product (briefly holds np[1] before the multiply) | |
119 | $tj=$s8; # word read back from the temporary vector; later reused as &tp[num] | |
120 | $i=$s9; # outer-loop byte offset into bp | |
121 | $j=$s10; # inner-loop byte offset into ap and np | |
122 | $m1=$s11; # low word of lo0*n0, the per-iteration multiplier for np[] | |
123 | ||
124 | $FRAMESIZE=14; # stack frame size, counted in $SZREG-sized slots | |
125 | ||
126 | $code=<<___; | |
947716c1 AP |
127 | #include "mips_arch.h" |
128 | ||
09854736 AP |
129 | .text |
130 | ||
131 | .set noat | |
132 | .set noreorder | |
133 | ||
134 | .align 5 | |
135 | .globl bn_mul_mont | |
136 | .ent bn_mul_mont | |
137 | bn_mul_mont: | |
138 | ___ | |
139 | $code.=<<___ if ($flavour =~ /o32/i); | |
140 | lw $n0,16($sp) | |
141 | lw $num,20($sp) | |
142 | ___ | |
143 | $code.=<<___; | |
144 | slt $at,$num,4 | |
d4665887 | 145 | bnez $at,1f |
09854736 | 146 | li $t0,0 |
d4665887 | 147 | slt $at,$num,17 # on in-order CPU |
0c2adb0a | 148 | bnez $at,bn_mul_mont_internal |
d4665887 AP |
149 | nop |
150 | 1: jr $ra | |
09854736 AP |
151 | li $a0,0 |
152 | .end bn_mul_mont | |
153 | ||
154 | .align 5 | |
155 | .ent bn_mul_mont_internal | |
156 | bn_mul_mont_internal: | |
157 | .frame $fp,$FRAMESIZE*$SZREG,$ra | |
158 | .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG | |
159 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | |
160 | $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) | |
161 | $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) | |
162 | $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) | |
163 | $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) | |
164 | $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) | |
165 | $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) | |
166 | $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) | |
167 | $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) | |
168 | $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) | |
169 | ___ | |
170 | $code.=<<___ if ($flavour =~ /nubi/i); | |
171 | $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) | |
172 | $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) | |
173 | $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) | |
174 | $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) | |
175 | ___ | |
176 | $code.=<<___; | |
177 | move $fp,$sp | |
178 | ||
179 | .set reorder | |
180 | $LD $n0,0($n0) | |
181 | $LD $bi,0($bp) # bp[0] | |
182 | $LD $aj,0($ap) # ap[0] | |
183 | $LD $nj,0($np) # np[0] | |
184 | ||
185 | $PTR_SUB $sp,2*$BNSZ # place for two extra words | |
186 | sll $num,`log($BNSZ)/log(2)` | |
187 | li $at,-4096 | |
188 | $PTR_SUB $sp,$num | |
189 | and $sp,$at | |
190 | ||
947716c1 AP |
191 | $MULTU ($aj,$bi) |
192 | $LD $ahi,$BNSZ($ap) | |
193 | $LD $nhi,$BNSZ($np) | |
194 | mflo ($lo0,$aj,$bi) | |
195 | mfhi ($hi0,$aj,$bi) | |
196 | $MULTU ($lo0,$n0) | |
197 | mflo ($m1,$lo0,$n0) | |
198 | ||
199 | $MULTU ($ahi,$bi) | |
200 | mflo ($alo,$ahi,$bi) | |
201 | mfhi ($ahi,$ahi,$bi) | |
202 | ||
203 | $MULTU ($nj,$m1) | |
204 | mflo ($lo1,$nj,$m1) | |
205 | mfhi ($hi1,$nj,$m1) | |
206 | $MULTU ($nhi,$m1) | |
09854736 AP |
207 | $ADDU $lo1,$lo0 |
208 | sltu $at,$lo1,$lo0 | |
209 | $ADDU $hi1,$at | |
947716c1 AP |
210 | mflo ($nlo,$nhi,$m1) |
211 | mfhi ($nhi,$nhi,$m1) | |
09854736 AP |
212 | |
213 | move $tp,$sp | |
214 | li $j,2*$BNSZ | |
215 | .align 4 | |
216 | .L1st: | |
217 | .set noreorder | |
218 | $PTR_ADD $aj,$ap,$j | |
219 | $PTR_ADD $nj,$np,$j | |
220 | $LD $aj,($aj) | |
221 | $LD $nj,($nj) | |
222 | ||
947716c1 | 223 | $MULTU ($aj,$bi) |
09854736 AP |
224 | $ADDU $lo0,$alo,$hi0 |
225 | $ADDU $lo1,$nlo,$hi1 | |
226 | sltu $at,$lo0,$hi0 | |
227 | sltu $t0,$lo1,$hi1 | |
228 | $ADDU $hi0,$ahi,$at | |
229 | $ADDU $hi1,$nhi,$t0 | |
947716c1 AP |
230 | mflo ($alo,$aj,$bi) |
231 | mfhi ($ahi,$aj,$bi) | |
09854736 AP |
232 | |
233 | $ADDU $lo1,$lo0 | |
234 | sltu $at,$lo1,$lo0 | |
947716c1 | 235 | $MULTU ($nj,$m1) |
09854736 AP |
236 | $ADDU $hi1,$at |
237 | addu $j,$BNSZ | |
238 | $ST $lo1,($tp) | |
239 | sltu $t0,$j,$num | |
947716c1 AP |
240 | mflo ($nlo,$nj,$m1) |
241 | mfhi ($nhi,$nj,$m1) | |
09854736 AP |
242 | |
243 | bnez $t0,.L1st | |
244 | $PTR_ADD $tp,$BNSZ | |
245 | .set reorder | |
246 | ||
247 | $ADDU $lo0,$alo,$hi0 | |
248 | sltu $at,$lo0,$hi0 | |
249 | $ADDU $hi0,$ahi,$at | |
250 | ||
251 | $ADDU $lo1,$nlo,$hi1 | |
252 | sltu $t0,$lo1,$hi1 | |
253 | $ADDU $hi1,$nhi,$t0 | |
254 | $ADDU $lo1,$lo0 | |
255 | sltu $at,$lo1,$lo0 | |
256 | $ADDU $hi1,$at | |
257 | ||
258 | $ST $lo1,($tp) | |
259 | ||
260 | $ADDU $hi1,$hi0 | |
261 | sltu $at,$hi1,$hi0 | |
262 | $ST $hi1,$BNSZ($tp) | |
263 | $ST $at,2*$BNSZ($tp) | |
264 | ||
265 | li $i,$BNSZ | |
266 | .align 4 | |
267 | .Louter: | |
268 | $PTR_ADD $bi,$bp,$i | |
269 | $LD $bi,($bi) | |
270 | $LD $aj,($ap) | |
947716c1 | 271 | $LD $ahi,$BNSZ($ap) |
09854736 AP |
272 | $LD $tj,($sp) |
273 | ||
947716c1 | 274 | $MULTU ($aj,$bi) |
09854736 | 275 | $LD $nj,($np) |
947716c1 AP |
276 | $LD $nhi,$BNSZ($np) |
277 | mflo ($lo0,$aj,$bi) | |
278 | mfhi ($hi0,$aj,$bi) | |
09854736 | 279 | $ADDU $lo0,$tj |
947716c1 | 280 | $MULTU ($lo0,$n0) |
09854736 AP |
281 | sltu $at,$lo0,$tj |
282 | $ADDU $hi0,$at | |
947716c1 | 283 | mflo ($m1,$lo0,$n0) |
09854736 | 284 | |
947716c1 AP |
285 | $MULTU ($ahi,$bi) |
286 | mflo ($alo,$ahi,$bi) | |
287 | mfhi ($ahi,$ahi,$bi) | |
09854736 | 288 | |
947716c1 AP |
289 | $MULTU ($nj,$m1) |
290 | mflo ($lo1,$nj,$m1) | |
291 | mfhi ($hi1,$nj,$m1) | |
09854736 | 292 | |
947716c1 | 293 | $MULTU ($nhi,$m1) |
09854736 AP |
294 | $ADDU $lo1,$lo0 |
295 | sltu $at,$lo1,$lo0 | |
296 | $ADDU $hi1,$at | |
947716c1 AP |
297 | mflo ($nlo,$nhi,$m1) |
298 | mfhi ($nhi,$nhi,$m1) | |
09854736 AP |
299 | |
300 | move $tp,$sp | |
301 | li $j,2*$BNSZ | |
302 | $LD $tj,$BNSZ($tp) | |
303 | .align 4 | |
304 | .Linner: | |
305 | .set noreorder | |
306 | $PTR_ADD $aj,$ap,$j | |
307 | $PTR_ADD $nj,$np,$j | |
308 | $LD $aj,($aj) | |
309 | $LD $nj,($nj) | |
310 | ||
947716c1 | 311 | $MULTU ($aj,$bi) |
09854736 AP |
312 | $ADDU $lo0,$alo,$hi0 |
313 | $ADDU $lo1,$nlo,$hi1 | |
314 | sltu $at,$lo0,$hi0 | |
315 | sltu $t0,$lo1,$hi1 | |
316 | $ADDU $hi0,$ahi,$at | |
317 | $ADDU $hi1,$nhi,$t0 | |
947716c1 AP |
318 | mflo ($alo,$aj,$bi) |
319 | mfhi ($ahi,$aj,$bi) | |
09854736 AP |
320 | |
321 | $ADDU $lo0,$tj | |
322 | addu $j,$BNSZ | |
947716c1 | 323 | $MULTU ($nj,$m1) |
09854736 AP |
324 | sltu $at,$lo0,$tj |
325 | $ADDU $lo1,$lo0 | |
326 | $ADDU $hi0,$at | |
327 | sltu $t0,$lo1,$lo0 | |
328 | $LD $tj,2*$BNSZ($tp) | |
329 | $ADDU $hi1,$t0 | |
330 | sltu $at,$j,$num | |
947716c1 AP |
331 | mflo ($nlo,$nj,$m1) |
332 | mfhi ($nhi,$nj,$m1) | |
09854736 AP |
333 | $ST $lo1,($tp) |
334 | bnez $at,.Linner | |
335 | $PTR_ADD $tp,$BNSZ | |
336 | .set reorder | |
337 | ||
338 | $ADDU $lo0,$alo,$hi0 | |
339 | sltu $at,$lo0,$hi0 | |
340 | $ADDU $hi0,$ahi,$at | |
341 | $ADDU $lo0,$tj | |
342 | sltu $t0,$lo0,$tj | |
343 | $ADDU $hi0,$t0 | |
344 | ||
345 | $LD $tj,2*$BNSZ($tp) | |
346 | $ADDU $lo1,$nlo,$hi1 | |
347 | sltu $at,$lo1,$hi1 | |
348 | $ADDU $hi1,$nhi,$at | |
349 | $ADDU $lo1,$lo0 | |
350 | sltu $t0,$lo1,$lo0 | |
351 | $ADDU $hi1,$t0 | |
352 | $ST $lo1,($tp) | |
353 | ||
354 | $ADDU $lo1,$hi1,$hi0 | |
355 | sltu $hi1,$lo1,$hi0 | |
356 | $ADDU $lo1,$tj | |
357 | sltu $at,$lo1,$tj | |
358 | $ADDU $hi1,$at | |
359 | $ST $lo1,$BNSZ($tp) | |
360 | $ST $hi1,2*$BNSZ($tp) | |
361 | ||
362 | addu $i,$BNSZ | |
363 | sltu $t0,$i,$num | |
364 | bnez $t0,.Louter | |
365 | \f | |
366 | .set noreorder | |
367 | $PTR_ADD $tj,$sp,$num # &tp[num] | |
368 | move $tp,$sp | |
369 | move $ap,$sp | |
370 | li $hi0,0 # clear borrow bit | |
371 | ||
372 | .align 4 | |
373 | .Lsub: $LD $lo0,($tp) | |
374 | $LD $lo1,($np) | |
375 | $PTR_ADD $tp,$BNSZ | |
376 | $PTR_ADD $np,$BNSZ | |
377 | $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] | |
378 | sgtu $at,$lo1,$lo0 | |
379 | $SUBU $lo0,$lo1,$hi0 | |
380 | sgtu $hi0,$lo0,$lo1 | |
381 | $ST $lo0,($rp) | |
382 | or $hi0,$at | |
383 | sltu $at,$tp,$tj | |
384 | bnez $at,.Lsub | |
385 | $PTR_ADD $rp,$BNSZ | |
386 | ||
387 | $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit | |
388 | move $tp,$sp | |
389 | $PTR_SUB $rp,$num # restore rp | |
390 | not $hi1,$hi0 | |
391 | ||
774ff8fe AP |
392 | .Lcopy: $LD $nj,($tp) # conditional move |
393 | $LD $aj,($rp) | |
09854736 AP |
394 | $ST $zero,($tp) |
395 | $PTR_ADD $tp,$BNSZ | |
774ff8fe AP |
396 | and $nj,$hi0 |
397 | and $aj,$hi1 | |
398 | or $aj,$nj | |
09854736 AP |
399 | sltu $at,$tp,$tj |
400 | $ST $aj,($rp) | |
401 | bnez $at,.Lcopy | |
402 | $PTR_ADD $rp,$BNSZ | |
403 | ||
404 | li $a0,1 | |
405 | li $t0,1 | |
406 | ||
407 | .set noreorder | |
408 | move $sp,$fp | |
409 | $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) | |
410 | $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) | |
411 | $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) | |
412 | $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) | |
413 | $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) | |
414 | $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) | |
415 | $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) | |
416 | $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) | |
417 | $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) | |
418 | ___ | |
419 | $code.=<<___ if ($flavour =~ /nubi/i); | |
420 | $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) | |
421 | $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) | |
422 | $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) | |
423 | $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) | |
424 | ___ | |
425 | $code.=<<___; | |
426 | jr $ra | |
427 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | |
428 | .end bn_mul_mont_internal | |
429 | .rdata | |
430 | .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | |
431 | ___ | |
432 | ||
433 | $code =~ s/\`([^\`]*)\`/eval $1/gem; # replace each backtick-quoted expression in the template with its evaluated result | |
434 | ||
435 | print $code; # emit the generated assembly on STDOUT | |
a21314db | 436 | close STDOUT or die "error closing STDOUT: $!"; # die if closing the output handle fails |