3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
12 # SHA256 block procedure for ARMv4. May 2007.
14 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
15 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
16 # byte [on single-issue Xscale PXA250 core].
20 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
21 # Cortex A8 core and ~20 cycles per processed byte.
25 # Profiler-assisted and platform-specific optimization resulted in 16%
26 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
30 # Add NEON implementation. On Cortex A8 it was measured to process one
31 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
32 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
33 # code (meaning that latter performs sub-optimally, nothing was done about it).
38 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
41 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
42 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
44 if ($flavour && $flavour ne "void") {
45 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
46 ( $xlate="${dir}arm-xlate.pl" and -f
$xlate ) or
47 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f
$xlate) or
48 die "can't locate arm-xlate.pl";
50 open STDOUT
,"| \"$^X\" $xlate $flavour $output";
52 open STDOUT
,">$output";
67 @V=($A,$B,$C,$D,$E,$F,$G,$H);
77 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
79 $code.=<<___
if ($i<16);
81 @ ldr
$t1,[$inp],#4 @ $i
83 str
$inp,[sp
,#17*4] @ make room for $t4
85 eor
$t0,$e,$e,ror
#`$Sigma1[1]-$Sigma1[0]`
86 add
$a,$a,$t2 @ h
+=Maj
(a
,b
,c
) from the past
87 eor
$t0,$t0,$e,ror
#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
92 @ ldrb
$t1,[$inp,#3] @ $i
93 add
$a,$a,$t2 @ h
+=Maj
(a
,b
,c
) from the past
98 orr
$t1,$t1,$t0,lsl
#16
100 str
$inp,[sp
,#17*4] @ make room for $t4
102 eor
$t0,$e,$e,ror
#`$Sigma1[1]-$Sigma1[0]`
103 orr
$t1,$t1,$t2,lsl
#24
104 eor
$t0,$t0,$e,ror
#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
108 ldr
$t2,[$Ktbl],#4 @ *K256++
109 add
$h,$h,$t1 @ h
+=X
[i
]
110 str
$t1,[sp
,#`$i%16`*4]
112 add
$h,$h,$t0,ror
#$Sigma1[0] @ h+=Sigma1(e)
114 add
$h,$h,$t2 @ h
+=K256
[i
]
115 eor
$t1,$t1,$g @ Ch
(e
,f
,g
)
116 eor
$t0,$a,$a,ror
#`$Sigma0[1]-$Sigma0[0]`
117 add
$h,$h,$t1 @ h
+=Ch
(e
,f
,g
)
120 cmp $t2,#0xf2 @ done?
124 ldr
$t1,[$inp],#4 @ prefetch
128 eor
$t2,$a,$b @ a
^b
, b
^c
in next round
130 ldr
$t1,[sp
,#`($i+2)%16`*4] @ from future BODY_16_xx
131 eor
$t2,$a,$b @ a
^b
, b
^c
in next round
132 ldr
$t4,[sp
,#`($i+15)%16`*4] @ from future BODY_16_xx
134 eor
$t0,$t0,$a,ror
#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
135 and $t3,$t3,$t2 @
(b
^c
)&=(a
^b
)
137 eor
$t3,$t3,$b @ Maj
(a
,b
,c
)
138 add
$h,$h,$t0,ror
#$Sigma0[0] @ h+=Sigma0(a)
139 @ add
$h,$h,$t3 @ h
+=Maj
(a
,b
,c
)
145 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
148 @ ldr
$t1,[sp
,#`($i+1)%16`*4] @ $i
149 @ ldr
$t4,[sp
,#`($i+14)%16`*4]
150 mov
$t0,$t1,ror
#$sigma0[0]
151 add
$a,$a,$t2 @ h
+=Maj
(a
,b
,c
) from the past
152 mov
$t2,$t4,ror
#$sigma1[0]
153 eor
$t0,$t0,$t1,ror
#$sigma0[1]
154 eor
$t2,$t2,$t4,ror
#$sigma1[1]
155 eor
$t0,$t0,$t1,lsr
#$sigma0[2] @ sigma0(X[i+1])
156 ldr
$t1,[sp
,#`($i+0)%16`*4]
157 eor
$t2,$t2,$t4,lsr
#$sigma1[2] @ sigma1(X[i+14])
158 ldr
$t4,[sp
,#`($i+9)%16`*4]
161 eor
$t0,$e,$e,ror
#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
163 eor
$t0,$t0,$e,ror
#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
164 add
$t1,$t1,$t4 @ X
[i
]
171 # include "arm_arch.h"
173 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
174 # define __ARM_MAX_ARCH__ 7
182 # if defined(__thumb2__) && !defined(__APPLE__)
193 .word
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
194 .word
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
195 .word
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
196 .word
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
197 .word
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
198 .word
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
199 .word
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
200 .word
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
201 .word
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
202 .word
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
203 .word
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
204 .word
0xd192e819,0xd6990624,0xf40e3585,0x106aa070
205 .word
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
206 .word
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
207 .word
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
208 .word
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
211 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
213 .word OPENSSL_armcap_P
-.Lsha256_block_data_order
217 .global sha256_block_data_order
218 .type sha256_block_data_order
,%function
219 sha256_block_data_order
:
220 .Lsha256_block_data_order
:
222 sub r3
,pc
,#8 @ sha256_block_data_order
224 adr r3
,sha256_block_data_order
226 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
227 ldr r12
,.LOPENSSL_armcap
228 ldr r12
,[r3
,r12
] @ OPENSSL_armcap_P
232 tst r12
,#ARMV8_SHA256
237 add
$len,$inp,$len,lsl
#6 @ len to point at the end of inp
238 stmdb sp
!,{$ctx,$inp,$len,r4
-r11
,lr
}
239 ldmia
$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
240 sub $Ktbl,r3
,#256+32 @ K256
241 sub sp
,sp
,#16*4 @ alloca(X[16])
248 eor
$t3,$B,$C @ magic
251 for($i=0;$i<16;$i++) { &BODY_00_15
($i,@V); unshift(@V,pop(@V)); }
252 $code.=".Lrounds_16_xx:\n";
253 for (;$i<32;$i++) { &BODY_16_XX
($i,@V); unshift(@V,pop(@V)); }
256 ite
eq @ Thumb2 thing
, sanity check
in ARM
258 ldreq
$t3,[sp
,#16*4] @ pull ctx
261 add
$A,$A,$t2 @ h
+=Maj
(a
,b
,c
) from the past
276 ldr
$inp,[sp
,#17*4] @ pull inp
277 ldr
$t2,[sp
,#18*4] @ pull inp+len
280 stmia
$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
282 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
285 add sp
,sp
,#`16+3`*4 @ destroy frame
287 ldmia sp
!,{r4
-r11
,pc
}
289 ldmia sp
!,{r4
-r11
,lr
}
291 moveq pc
,lr @ be binary compatible with V4
, yet
292 bx lr @ interoperable with Thumb ISA
:-)
294 .size sha256_block_data_order
,.-sha256_block_data_order
296 ######################################################################
300 my @X=map("q$_",(0..3));
301 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# Map a NEON quad-register name ("qN", N = 0..15) to the name of its
# low doubleword half ("d2N"); yields "" for anything that does not
# contain a q-register name.
sub Dlo()
{
    my $qreg = shift;
    if ($qreg =~ m/q([1]?[0-9])/) {
        return "d" . (2 * $1);
    }
    return "";
}
# Map a NEON quad-register name ("qN", N = 0..15) to the name of its
# high doubleword half ("d2N+1"); yields "" for anything that does not
# contain a q-register name.
sub Dhi()
{
    my $qreg = shift;
    if ($qreg =~ m/q([1]?[0-9])/) {
        return "d" . (2 * $1 + 1);
    }
    return "";
}
# Thunk [simplified] x86-style perlasm: any call to an otherwise-undefined
# sub, e.g. &vadd_i32("q8","q8","q0"), is converted into an assembly line
# appended to $code.  The sub name becomes the mnemonic (first "_" turned
# into "."); the last argument gets a "#" prefix when it is numeric, i.e.
# an immediate operand.
# NOTE(review): the visible fragment used $arg without ever popping it from
# @_ — the "my $arg = pop;" line was missing, so every generated instruction
# would have had an undefined last operand.  Restored here.
sub AUTOLOAD()
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
318 my @insns = (&$body,&$body,&$body,&$body);
319 my ($a,$b,$c,$d,$e,$f,$g,$h);
321 &vext_8
($T0,@X[0],@X[1],4); # X[1..4]
325 &vext_8
($T1,@X[2],@X[3],4); # X[9..12]
329 &vshr_u32
($T2,$T0,$sigma0[0]);
332 &vadd_i32
(@X[0],@X[0],$T1); # X[0..3] += X[9..12]
335 &vshr_u32
($T1,$T0,$sigma0[2]);
338 &vsli_32
($T2,$T0,32-$sigma0[0]);
341 &vshr_u32
($T3,$T0,$sigma0[1]);
347 &vsli_32
($T3,$T0,32-$sigma0[1]);
350 &vshr_u32
($T4,&Dhi
(@X[3]),$sigma1[0]);
353 &veor
($T1,$T1,$T3); # sigma0(X[1..4])
356 &vsli_32
($T4,&Dhi
(@X[3]),32-$sigma1[0]);
359 &vshr_u32
($T5,&Dhi
(@X[3]),$sigma1[2]);
362 &vadd_i32
(@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
368 &vshr_u32
($T4,&Dhi
(@X[3]),$sigma1[1]);
371 &vsli_32
($T4,&Dhi
(@X[3]),32-$sigma1[1]);
374 &veor
($T5,$T5,$T4); # sigma1(X[14..15])
377 &vadd_i32
(&Dlo
(@X[0]),&Dlo
(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
380 &vshr_u32
($T4,&Dlo
(@X[0]),$sigma1[0]);
383 &vsli_32
($T4,&Dlo
(@X[0]),32-$sigma1[0]);
386 &vshr_u32
($T5,&Dlo
(@X[0]),$sigma1[2]);
392 &vshr_u32
($T4,&Dlo
(@X[0]),$sigma1[1]);
395 &vld1_32
("{$T0}","[$Ktbl,:128]!");
398 &vsli_32
($T4,&Dlo
(@X[0]),32-$sigma1[1]);
401 &veor
($T5,$T5,$T4); # sigma1(X[16..17])
404 &vadd_i32
(&Dhi
(@X[0]),&Dhi
(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
407 &vadd_i32
($T0,$T0,@X[0]);
408 while($#insns>=2) { eval(shift(@insns)); }
409 &vst1_32
("{$T0}","[$Xfer,:128]!");
413 push(@X,shift(@X)); # "rotate" X[]
419 my @insns = (&$body,&$body,&$body,&$body);
420 my ($a,$b,$c,$d,$e,$f,$g,$h);
426 &vld1_32
("{$T0}","[$Ktbl,:128]!");
431 &vrev32_8
(@X[0],@X[0]);
436 &vadd_i32
($T0,$T0,@X[0]);
437 foreach (@insns) { eval; } # remaining instructions
438 &vst1_32
("{$T0}","[$Xfer,:128]!");
440 push(@X,shift(@X)); # "rotate" X[]
445 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
446 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
448 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
449 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
451 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
452 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
453 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
454 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
455 '&eor ($t2,$a,$b)', # a^b, b^c in next round
456 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
457 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
458 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
459 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
460 '&ldr ($t1,"[sp,#64]") if ($j==31)',
461 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
462 '&add ($d,$d,$h)', # d+=h
463 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
464 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
465 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
470 #if __ARM_MAX_ARCH__>=7
474 .global sha256_block_data_order_neon
475 .type sha256_block_data_order_neon
,%function
477 sha256_block_data_order_neon
:
479 stmdb sp
!,{r4
-r12
,lr
}
483 bic
$H,$H,#15 @ align for 128-bit stores
486 add
$len,$inp,$len,lsl
#6 @ len to point at the end of inp
488 vld1
.8
{@X[0]},[$inp]!
489 vld1
.8
{@X[1]},[$inp]!
490 vld1
.8
{@X[2]},[$inp]!
491 vld1
.8
{@X[3]},[$inp]!
492 vld1
.32
{$T0},[$Ktbl,:128]!
493 vld1
.32
{$T1},[$Ktbl,:128]!
494 vld1
.32
{$T2},[$Ktbl,:128]!
495 vld1
.32
{$T3},[$Ktbl,:128]!
496 vrev32
.8
@X[0],@X[0] @ yes
, even on
498 vrev32
.8
@X[1],@X[1] @ big
-endian
504 str
$t2,[sp
,#76] @ save original sp
505 vadd
.i32
$T0,$T0,@X[0]
506 vadd
.i32
$T1,$T1,@X[1]
507 vst1
.32
{$T0},[$Xfer,:128]!
508 vadd
.i32
$T2,$T2,@X[2]
509 vst1
.32
{$T1},[$Xfer,:128]!
510 vadd
.i32
$T3,$T3,@X[3]
511 vst1
.32
{$T2},[$Xfer,:128]!
512 vst1
.32
{$T3},[$Xfer,:128]!
524 &Xupdate
(\
&body_00_15
);
525 &Xupdate
(\
&body_00_15
);
526 &Xupdate
(\
&body_00_15
);
527 &Xupdate
(\
&body_00_15
);
529 teq
$t1,#0 @ check for K256 terminator
536 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
539 subeq
$inp,$inp,#64 @ avoid SEGV
540 vld1
.8
{@X[0]},[$inp]! @ load
next input block
541 vld1
.8
{@X[1]},[$inp]!
542 vld1
.8
{@X[2]},[$inp]!
543 vld1
.8
{@X[3]},[$inp]!
548 &Xpreload
(\
&body_00_15
);
549 &Xpreload
(\
&body_00_15
);
550 &Xpreload
(\
&body_00_15
);
551 &Xpreload
(\
&body_00_15
);
554 add
$A,$A,$t2 @ h
+=Maj
(a
,b
,c
) from the past
558 add
$A,$A,$t0 @ accumulate
580 ldreq sp
,[sp
,#76] @ restore original sp
585 ldmia sp
!,{r4
-r12
,pc
}
586 .size sha256_block_data_order_neon
,.-sha256_block_data_order_neon
590 ######################################################################
594 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
595 my @MSG=map("q$_",(8..11));
596 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
600 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
602 # if defined(__thumb2__) && !defined(__APPLE__)
603 # define INST(a,b,c,d) .byte c,d|0xc,a,b
605 # define INST(a,b,c,d) .byte a,b,c,d
608 .type sha256_block_data_order_armv8
,%function
610 sha256_block_data_order_armv8
:
612 vld1
.32
{$ABCD,$EFGH},[$ctx]
614 sub $Ktbl,$Ktbl,#256+32
615 # elif defined(__thumb2__)
617 sub $Ktbl,$Ktbl,#.LARMv8-K256
621 add
$len,$inp,$len,lsl
#6 @ len to point at the end of inp
624 vld1
.8
{@MSG[0]-@MSG[1]},[$inp]!
625 vld1
.8
{@MSG[2]-@MSG[3]},[$inp]!
626 vld1
.32
{$W0},[$Ktbl]!
627 vrev32
.8
@MSG[0],@MSG[0]
628 vrev32
.8
@MSG[1],@MSG[1]
629 vrev32
.8
@MSG[2],@MSG[2]
630 vrev32
.8
@MSG[3],@MSG[3]
631 vmov
$ABCD_SAVE,$ABCD @ offload
632 vmov
$EFGH_SAVE,$EFGH
635 for($i=0;$i<12;$i++) {
637 vld1
.32
{$W1},[$Ktbl]!
638 vadd
.i32
$W0,$W0,@MSG[0]
639 sha256su0
@MSG[0],@MSG[1]
641 sha256h
$ABCD,$EFGH,$W0
642 sha256h2
$EFGH,$abcd,$W0
643 sha256su1
@MSG[0],@MSG[2],@MSG[3]
645 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
648 vld1
.32
{$W1},[$Ktbl]!
649 vadd
.i32
$W0,$W0,@MSG[0]
651 sha256h
$ABCD,$EFGH,$W0
652 sha256h2
$EFGH,$abcd,$W0
654 vld1
.32
{$W0},[$Ktbl]!
655 vadd
.i32
$W1,$W1,@MSG[1]
657 sha256h
$ABCD,$EFGH,$W1
658 sha256h2
$EFGH,$abcd,$W1
660 vld1
.32
{$W1},[$Ktbl]
661 vadd
.i32
$W0,$W0,@MSG[2]
662 sub $Ktbl,$Ktbl,#256-16 @ rewind
664 sha256h
$ABCD,$EFGH,$W0
665 sha256h2
$EFGH,$abcd,$W0
667 vadd
.i32
$W1,$W1,@MSG[3]
669 sha256h
$ABCD,$EFGH,$W1
670 sha256h2
$EFGH,$abcd,$W1
672 vadd
.i32
$ABCD,$ABCD,$ABCD_SAVE
673 vadd
.i32
$EFGH,$EFGH,$EFGH_SAVE
677 vst1
.32
{$ABCD,$EFGH},[$ctx]
680 .size sha256_block_data_order_armv8
,.-sha256_block_data_order_armv8
685 .asciz
"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
687 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
688 .comm OPENSSL_armcap_P
,4,4
695 last if (!s/^#/@/ and !/^$/);
701 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
702 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
705 my ($mnemonic,$arg)=@_;
707 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
708 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
709 |(($2&7)<<17)|(($2&8)<<4)
710 |(($3&7)<<1) |(($3&8)<<2);
711 # since ARMv7 instructions are always encoded little-endian.
712 # correct solution is to use .inst directive, but older
713 # assemblers don't implement it:-(
714 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
715 $word&0xff,($word>>8)&0xff,
716 ($word>>16)&0xff,($word>>24)&0xff,
722 foreach (split($/,$code)) {
724 s/\`([^\`]*)\`/eval $1/geo;
726 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
728 s/\bret\b/bx lr/go or
729 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
734 close STDOUT
; # enforce flush