#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter:
# it won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
#	512-bit		+65%
#	1024-bit	+35%
#	2048-bit	+18%
#	4096-bit	+4%
# Select word-size-dependent parameters from the requested flavour
# (first command-line argument, e.g. "linux32", "aix64").  $BITS,
# $BNSZ (bytes per bignum limb), $SIZE_T and $RZONE parametrize the
# ABI; the $LD/$ST/... variables pick 32- or 64-bit mnemonics that
# are interpolated into the assembly template below.
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$SIZE_T=4;
	$RZONE=	224;

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

$FRAME=8*$SIZE_T+$RZONE;	# stack frame: register save area + red zone
$LOCALS=8*$SIZE_T;		# offset of the tp[] scratch vector on stack

# Locate the ppc-xlate.pl translator next to this script or in the
# shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe our output through the translator.  NOTE: low-precedence "or"
# is essential here -- with "||" the operator binds to the (always
# true) string argument instead of open()'s return value, so a failed
# open would go completely unnoticed.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
# ----------------------------------------------------------------------
# Symbolic register assignments used throughout the assembly template.
#
# Volatile registers.  r3 serves double duty: it carries the incoming
# result pointer, which the prologue moves to r9 ("mr $rp,r3"), after
# which r3 is recycled as the overflow word $ovf.
$sp  = "r1";		# stack pointer
$toc = "r2";		# TOC pointer
$rp  = "r3";		# result pointer as passed in...
$ovf = "r3";		# ...recycled as overflow word
$ap  = "r4";		# a[] pointer
$bp  = "r5";		# b[] pointer
$np  = "r6";		# modulus n[] pointer
$n0  = "r7";		# pointer to n0 value
$num = "r8";		# limb count
$rp  = "r9";		# $rp is reassigned; result pointer lives here
$aj  = "r10";		# scratch: a[j]
$nj  = "r11";		# scratch: n[j]
$tj  = "r12";		# scratch: t[j], also saved stack pointer

# Non-volatile registers, saved/restored in prologue/epilogue.
$i   = "r20";		# outer loop index
$j   = "r21";		# inner loop index
$tp  = "r22";		# t[] scratch vector pointer
$m0  = "r23";		# current b[i]
$m1  = "r24";		# "tp[0]"*n0, reduction multiplier
$lo0 = "r25";		# a-side accumulator, low word
$hi0 = "r26";		# a-side accumulator, high word
$lo1 = "r27";		# n-side accumulator, low word
$hi1 = "r28";		# n-side accumulator, high word
$alo = "r29";		# a[j]*m0 partial product, low
$ahi = "r30";		# a[j]*m0 partial product, high
$nlo = "r31";		# n[j]*m1 partial product, low

$nhi = "r0";		# n[j]*m1 partial product, high
# ----------------------------------------------------------------------
# Assembly template.  Register variables, mnemonic variables and
# backquoted `...` expressions are interpolated/evaluated before the
# text is piped to ppc-xlate.pl.
#
# Generated routine (as visible from the code below):
#	bn_mul_mont_int(rp, ap, bp, np, n0, num)
# It returns 0 (leaving the caller to fall back) when num<4, or --
# in the 32-bit build -- when num>=32; otherwise it performs the
# Montgomery multiplication and returns 1.
$code=<<___;
.machine	"any"
.text

.globl	.bn_mul_mont_int
.align	4
.bn_mul_mont_int:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr
___
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0
.size	.bn_mul_mont_int,.-.bn_mul_mont_int

.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the backquoted arithmetic placeholders and emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the xlate filter: buffered write errors and a
# failing child only surface at close(), so the status must be checked
# or a broken build would exit successfully with truncated output.
close STDOUT or die "error closing STDOUT: $!";