#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
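
# The amounts above are the rotate/shift counts of the SHA-256 functions
# from FIPS 180-4, spelled out here for reference (">>>" is rotate right,
# ">>" is shift right):
#
#	Sigma0(x) = (x >>>  2) ^ (x >>> 13) ^ (x >>> 22)
#	Sigma1(x) = (x >>>  6) ^ (x >>> 11) ^ (x >>> 25)
#	sigma0(x) = (x >>>  7) ^ (x >>> 18) ^ (x >>  3)
#	sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
#
# The third element of @sigma0/@sigma1 is a logical shift rather than a
# rotate, which is why it appears as lsr (or a lone vshr.u32) rather than
# ror in the code below.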

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
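
# For reference, one BODY_00_15 round computes the standard SHA-256 step
# using the register names above:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	d += T1
#	h  = T1 + Sigma0(a) + Maj(a,b,c)
#
# Maj(a,b,c) is not folded in here; it is built as ((b^c)&(a^b))^b in
# $t2/$t3 and the "h+=Maj(a,b,c) from the past" add at the top of the
# next round (and once more after the loop) completes it, so the Maj
# computation overlaps with the following round.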

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
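
# BODY_16_XX extends the message schedule in place: with the sixteen most
# recent words kept in X[0..15] on the stack, it computes
#
#	X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15])
#
# and then falls through to BODY_00_15 for the actual round, so rounds
# 16..63 reuse the same 16-word circular buffer instead of a full 64-word
# schedule.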

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# if defined(__thumb2__) && !defined(__APPLE__)
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
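
# Only rounds 16..31 are emitted explicitly; at run time .Lrounds_16_xx is
# taken twice more to cover rounds 32..63. The "#if $i==31" test inside
# BODY_00_15 compares the low byte of the K256 word just loaded against
# 0xf2 (the last constant is 0xc67178f2), which is how the loop below
# knows when all 64 rounds are done.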
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
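
# Dlo/Dhi map a NEON q-register name to its two d-register aliases
# (q<n> overlays d<2n> and d<2n+1>), e.g. Dlo("q8") is "d16" and
# Dhi("q8") is "d17"; this is how the 64-bit halves of X[] are addressed.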

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
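
# AUTOLOAD catches calls to otherwise undefined subs such as &vshr_u32 or
# &vld1_32, turns the underscore back into a dot and appends the resulting
# instruction to $code; a trailing numeric argument becomes an immediate.
# For instance &vshr_u32($T2,$T0,7) emits "\tvshr.u32\tq10,q8,#7\n".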

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
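
# Xupdate and Xpreload each consume four round bodies from &$body and
# interleave them, one scalar instruction at a time, with the NEON schedule
# update (or, in Xpreload, with byte-swapping the next input block), so the
# integer pipeline and the NEON unit stay busy simultaneously. Each call
# also adds the matching K256 vector to the four freshly scheduled words
# and stores the result at [$Xfer] for the scalar rounds to pick up.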

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
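
# body_00_15 returns a list of Perl snippet strings, one scalar instruction
# (or bookkeeping step) each; Xupdate/Xpreload eval them one by one between
# NEON instructions. $j counts generated rounds so the right X word is
# fetched for the next round (or, at j==15 and j==31, the next K256 word
# for the terminator check and the saved ctx pointer respectively).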

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
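
# The ARMv8 path drives the Crypto Extension instructions: each group feeds
# one K256 vector and one message word quad through sha256su0/sha256su1
# (schedule update) and sha256h/sha256h2 (four rounds of the compression
# function on the ABCD/EFGH halves), so a 64-byte block takes sixteen such
# quad-round groups, the last four of which skip the schedule update.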
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__) && !defined(__APPLE__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef	__APPLE__
	sub	$Ktbl,$Ktbl,#256+32
# elif	defined(__thumb2__)
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___
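
# Copy this script's leading comment block (license/credits) into the
# generated file, converting "#" comments into "@" so they survive assembly;
# the copy stops at the first line that is neither a comment nor blank.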
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
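
# Final post-processing over the accumulated $code: evaluate the `...`
# arithmetic left in the text, replace sha256* mnemonics (which older
# assemblers reject) with hand-encoded INST(...) byte sequences via
# unsha256(), and rewrite ret/bx lr into forms that still assemble
# with -march=armv4.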
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush