#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================

# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise it's a straightforward
# implementation with the X vector kept in the register bank. The
# module is big-endian [which is not a big deal, as there are no
# little-endian targets left around].
# gcc-4.0.0	-m64	-m32
# --------------------------
# sha1		+76%	+59%

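# The output file name selects the ABI flavour: a name matching 64.s
# selects the 64-bit conventions (8-byte pointers, ld/std, 288-byte
# red zone), a name matching 32.s the 32-bit ones.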
$output = shift;

if ($output =~ /64\.s/) {
	$SIZE_T	=8;
	$RZONE	=288;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($output =~ /32\.s/) {
	$SIZE_T	=4;
	$RZONE	=224;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $output"; }

( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
	die "can't call ../perlasm/ppc-xlate.pl: $!";

$FRAME=24*$SIZE_T;

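# Register assignment: $ctx, $inp and $num are the three arguments
# (r3-r5 per the PPC ABI), $K holds the current round constant,
# $A-$E are the working state, $T is the rotating temporary, $t0/$t1
# are scratch, and the 16-word message schedule @X lives entirely in
# r16-r31.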
$K  ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

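# Rounds 0-19: F(b,c,d) = Ch(b,c,d) = (b AND c) OR ((NOT b) AND d).
# The first 16 rounds also load the input words; starting with round
# 15 the next schedule word W[i+1] = ROTL1(W[i-2]^W[i-7]^W[i-13]^W[i-15])
# is computed in place in the 16-entry circular buffer @X.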
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

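# Rounds 20-39 and 60-79: F(b,c,d) = b XOR c XOR d. The very last
# round ($i==79) also interleaves loads of the five hash words from
# the context into r16-r20, so the final update can proceed without
# waiting on memory.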
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

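# Rounds 40-59: F(b,c,d) = Maj(b,c,d), computed here as
# (b AND c) OR ((b OR c) AND d).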
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

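# Procedure prologue: allocate the stack frame (plus a 64-byte scratch
# area and the ABI red zone), save the link register and the
# non-volatile registers r15-r31, load the five chaining values from
# the context and dispatch on the alignment of the input pointer.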
$code=<<___;
.machine	any
.text

.globl	.sha1_block_asm_data_order
.align	4
.sha1_block_asm_data_order:
	mflr	r0
	$STU	$sp,`-($FRAME+64+$RZONE)`($sp)
	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
Ldone:
	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+64+$RZONE`
	blr
___

# The PowerPC specification allows an implementation to be ill-behaved
# upon an unaligned access that crosses a page boundary. The "better
# safe than sorry" principle makes me treat that case specially. I
# don't look for the particular offending word, but rather for a
# 64-byte input block which crosses the boundary. Once found, that
# block is copied to an aligned location and hashed separately...
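# Lunaligned computes how many whole 64-byte blocks precede the next
# page boundary and hashes them directly from the unaligned input;
# the block that straddles the boundary is copied byte by byte
# (Lmemcpy) into the scratch area just below the frame and hashed
# from there, after which the loop resumes with the remaining input.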
$code.=<<___;
.align	4
Lunaligned:
	li	$t1,4096
	subf	$t1,$inp,$t1
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subf	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$FRAME	; spot below the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
	li	$t1,1
	addi	$inp,$sp,$FRAME
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned
	b	Ldone
___

# This is the private block function, which uses a tailored calling
# interface: upon entry the SHA_CTX words are pre-loaded into the
# registers $A-$E and the counter register holds the number of
# 64-byte chunks to digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
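# The four groups below each load one of the SHA-1 round constants
# (0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6) into $K with
# lis/ori and emit 20 fully unrolled rounds; rotating @V with
# unshift(@V,pop(@V)) renames the working variables so no register
# moves are needed between rounds.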
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
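# Update the chaining values: add the working variables (under their
# rotated names after 80 rounds) to the hash words pre-loaded into
# r16-r20 during the last round, store the result back to the context
# and copy it into $A-$E for the next block, then advance the input
# pointer by 64 bytes and loop while CTR is non-zero.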
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___

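# Evaluate the backticked compile-time expressions (stack offsets and
# constants) embedded above and emit the resulting assembler source.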
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;