#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

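# Per FIPS 180-4: Sigma1(x)=ROTR^6(x)^ROTR^11(x)^ROTR^25(x), and so on
# for the other three functions.  The integer code evaluates each big
# Sigma via rotate differences, e.g.
#
#	Sigma1(e) = (e ^ (e ror (11-6)) ^ (e ror (25-6))) ror 6
#
# so the two eor-s use the offsets above and the common ror#6 is
# folded into the final add.
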
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

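# One round of the compression function:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	d += T1; h = T1 + Sigma0(a) + Maj(a,b,c)
#
# The closing h+=Maj(a,b,c) is deferred: because @V rotates, this
# round's $h is the next round's $a, so the "add $a,$a,$t2 @ ... from
# the past" below settles it there, hiding the Maj latency behind the
# message loads.
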
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

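# Message schedule expansion, kept in a rolling 16-word window on the
# stack:
#
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
#
# (the in-line comments use window-relative names, e.g. X[i+1] for
# X[i-15]); the sub then falls through into the common round body.
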
$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

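@ The integer entry point doubles as the dispatcher: it takes its own
@ address from pc, adds the .LOPENSSL_armcap offset to locate
@ OPENSSL_armcap_P, and branches to the NEON or ARMv8 code path when
@ the corresponding capability bit is set.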
.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
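# The first 16 rounds are emitted fully unrolled; rounds 16..31 are
# emitted once and looped over three times as .Lrounds_16_xx.  The
# "#if $i==31" test in BODY_00_15 compares the low byte of the K256
# word just loaded with 0xf2, which only matches K256[63]=0xc67178f2,
# i.e. the final round of the final pass.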
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()	{ shift=~m|q([1]?[0-9])|?"d".($1*2):"";		}
sub Dhi()	{ shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";	}

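# Dlo/Dhi map a q register to its low/high d-register halves.
# AUTOLOAD catches any otherwise-undefined &vxxx_yy() call and emits
# it as a "vxxx.yy" instruction, prefixing a final numeric argument
# with "#"; e.g. &vshr_u32($T2,$T0,7) appends "vshr.u32 q10,q8,#7"
# to $code.
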
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

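# Xupdate advances the message schedule by four words (one q register)
# and interleaves the scalar instructions of four integer rounds,
# supplied by &$body, into the gaps between the vector instructions,
# so the NEON and integer pipelines run concurrently.  Rotates are
# synthesized from vshr.u32/vsli.32 pairs, and the schedule words are
# pre-added to the round constants and staged through [$Xfer].
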
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

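# Xpreload is the no-update counterpart used for the last 16 rounds of
# a block: it byte-swaps the freshly loaded next input block and
# pre-adds the round constants while the remaining scalar rounds drain.
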
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

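# body_00_15 returns one integer round as a list of eval-able strings,
# one instruction each, for Xupdate/Xpreload to interleave.  $t1 holds
# X[i]+K256[i] picked up from the [sp] scratch area, and the deferred
# h+=Maj(a,b,c) trick from the ARMv4 code is retained.
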
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
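# Each .L_00_48 iteration processes 16 rounds plus 16 schedule words;
# the loop terminates once the K256 word fetched ahead ($t1, checked
# by the "teq $t1,#0" below) is the zero terminator appended after
# the table.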
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	teq	$inp,$t0
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!			@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,r3,#sha256_block_data_order-K256

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
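# Twelve quad-round groups with schedule update (sha256su0/sha256su1
# extend @MSG in place), followed by four more groups without update.
# sha256h overwrites the ABCD half of the state, so each group copies
# it to $abcd first for sha256h2 to consume.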
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

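# Pre-ARMv8 assemblers do not know the sha256* mnemonics, so they are
# assembled by hand: unsha256() splits each q-register number into the
# split Vd/Vn/Vm fields of the 32-bit opcode and emits the result as
# raw little-endian .byte data.
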
{ my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

  sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				     |(($2&7)<<17)|(($2&8)<<4)
				     |(($3&7)<<1) |(($3&8)<<2);
	# ARMv7 instructions are always encoded little-endian, so the
	# raw bytes can be emitted directly; the correct solution would
	# be the .inst directive, but older assemblers don't implement
	# it:-(
	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
  }
}

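# Final output filter: resolve `...` arithmetic, convert sha256*
# mnemonics via unsha256(), and rewrite ret/bx lr so that the result
# can still be assembled with -march=armv4.
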
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush