#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop; a dedicated
# squaring procedure should give a further 20%, and the code could be
# adapted for a 32-bit application running on a 64-bit CPU. As for the
# latter: it won't achieve "native" 64-bit performance, because in a
# 32-bit application context every addc instruction has to be expanded
# to addc, two right shifts by 32 and finally adde, etc. So far the
# RSA *sign* performance improvement over pre-bn_mul_mont assembler
# for a 64-bit application running on a PPC970/G5 is:
#
#	512-bit		+65%
#	1024-bit	+35%
#	2048-bit	+18%
#	4096-bit	+4%

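# For orientation, an informal sketch of the word-by-word (CIOS-style)
# reduction implemented below; names match the register aliases used
# further down, and b stands for the word base 2^BITS:
#
#	m1   = (tp[0] + ap[0]*bp[i])*n0 mod b
#	tp[] = (tp[] + ap[]*bp[i] + np[]*m1)/b
#
# The choice of m1 makes the sum divisible by b, so the division is an
# exact one-word shift, and tp[] stays congruent to ap*bp/b^(i+1)
# modulo np.
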
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$SIZE_T=4;
	$RZONE=	224;

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
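# ppc-xlate.pl (the perlasm helper piped to above) rewrites the
# flavour-neutral mnemonics emitted below into the assembler dialect
# implied by $flavour, e.g. linux32, linux64 or osx64.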

$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";

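# For reference (arguments arrive in r3 through r8 per the PPC calling
# convention), the entry point below implements the usual bn_mul_mont()
# contract:
#
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
#			const BN_ULONG *bp, const BN_ULONG *np,
#			const BN_ULONG *n0, int num);
#
# $rp is copied to r9 right away, because r3 doubles as the return
# value and as $ovf within the procedure.
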
$code=<<___;
.machine	"any"
.text

.globl	.bn_mul_mont_int
.align	4
.bn_mul_mont_int:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr
___
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

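; first pass (i=0): fill tp[] with (ap[]*bp[0]+np[]*m1)/2^$BITS;
; words 0 and 1 are peeled off before/after the loop, hence ctr=num-2
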
	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

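; the first pass is complete: the excess bit that does not fit in
; tp[0..num-1] is kept in $ovf and folded back in on each outer pass
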
	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp
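; note: $ovf is 0 or all ones at this point, so the and/andc/or
; sequence above picks tp or rp with masks rather than with a
; data-dependent branch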

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0
.size	.bn_mul_mont_int,.-.bn_mul_mont_int

.asciz	"Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;