# Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
#
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
28 # Cortex A8 core and ~20 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 16%
33 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process one
38 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40 # code (meaning that latter performs sub-optimally, nothing was done
45 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Command-line convention shared by all perlasm scripts:
# $output is the last argument if it looks like a file (it has an extension);
# $flavour is the first argument if it doesn't look like a file
# (e.g. "linux32", "ios32", "win32", or "void").
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
52 if ($flavour && $flavour ne "void") {
53 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
54 ( $xlate="${dir}arm-xlate.pl" and -f
$xlate ) or
55 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f
$xlate) or
56 die "can't locate arm-xlate.pl";
58 open STDOUT
,"| \"$^X\" $xlate $flavour \"$output\""
59 or die "can't call $xlate: $!";
61 $output and open STDOUT
,">$output";
76 @V=($A,$B,$C,$D,$E,$F,$G,$H);	# working variables; rotated each round via unshift(@V,pop(@V))
86 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
88 $code.=<<___
if ($i<16);
90 @ ldr
$t1,[$inp],#4 @ $i
92 str
$inp,[sp
,#17*4] @ make room for $t4
94 eor
$t0,$e,$e,ror
#`$Sigma1[1]-$Sigma1[0]`
95 add
$a,$a,$t2 @ h
+=Maj
(a
,b
,c
) from the past
96 eor
$t0,$t0,$e,ror
#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
101 @ ldrb
$t1,[$inp,#3] @ $i
102 add
$a,$a,$t2 @ h
+=Maj
(a
,b
,c
) from the past
105 orr
$t1,$t1,$t2,lsl
#8
107 orr
$t1,$t1,$t0,lsl
#16
109 str
$inp,[sp
,#17*4] @ make room for $t4
111 eor
$t0,$e,$e,ror
#`$Sigma1[1]-$Sigma1[0]`
112 orr
$t1,$t1,$t2,lsl
#24
113 eor
$t0,$t0,$e,ror
#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
117 ldr
$t2,[$Ktbl],#4 @ *K256++
118 add
$h,$h,$t1 @ h
+=X
[i
]
119 str
$t1,[sp
,#`$i%16`*4]
121 add
$h,$h,$t0,ror
#$Sigma1[0] @ h+=Sigma1(e)
123 add
$h,$h,$t2 @ h
+=K256
[i
]
124 eor
$t1,$t1,$g @ Ch
(e
,f
,g
)
125 eor
$t0,$a,$a,ror
#`$Sigma0[1]-$Sigma0[0]`
126 add
$h,$h,$t1 @ h
+=Ch
(e
,f
,g
)
129 cmp $t2,#0xf2 @ done?
133 ldr
$t1,[$inp],#4 @ prefetch
137 eor
$t2,$a,$b @ a
^b
, b
^c
in next round
139 ldr
$t1,[sp
,#`($i+2)%16`*4] @ from future BODY_16_xx
140 eor
$t2,$a,$b @ a
^b
, b
^c
in next round
141 ldr
$t4,[sp
,#`($i+15)%16`*4] @ from future BODY_16_xx
143 eor
$t0,$t0,$a,ror
#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
144 and $t3,$t3,$t2 @
(b
^c
)&=(a
^b
)
146 eor
$t3,$t3,$b @ Maj
(a
,b
,c
)
147 add
$h,$h,$t0,ror
#$Sigma0[0] @ h+=Sigma0(a)
148 @ add
$h,$h,$t3 @ h
+=Maj
(a
,b
,c
)
154 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
157 @ ldr
$t1,[sp
,#`($i+1)%16`*4] @ $i
158 @ ldr
$t4,[sp
,#`($i+14)%16`*4]
159 mov
$t0,$t1,ror
#$sigma0[0]
160 add
$a,$a,$t2 @ h
+=Maj
(a
,b
,c
) from the past
161 mov
$t2,$t4,ror
#$sigma1[0]
162 eor
$t0,$t0,$t1,ror
#$sigma0[1]
163 eor
$t2,$t2,$t4,ror
#$sigma1[1]
164 eor
$t0,$t0,$t1,lsr
#$sigma0[2] @ sigma0(X[i+1])
165 ldr
$t1,[sp
,#`($i+0)%16`*4]
166 eor
$t2,$t2,$t4,lsr
#$sigma1[2] @ sigma1(X[i+14])
167 ldr
$t4,[sp
,#`($i+9)%16`*4]
170 eor
$t0,$e,$e,ror
#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
172 eor
$t0,$t0,$e,ror
#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
173 add
$t1,$t1,$t4 @ X
[i
]
180 # include "arm_arch.h"
182 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
183 # define __ARM_MAX_ARCH__ 7
186 #if defined(__thumb2__)
198 .word
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
199 .word
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
200 .word
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
201 .word
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
202 .word
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
203 .word
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
204 .word
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
205 .word
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
206 .word
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
207 .word
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
208 .word
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
209 .word
0xd192e819,0xd6990624,0xf40e3585,0x106aa070
210 .word
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
211 .word
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
212 .word
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
213 .word
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
216 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
219 .word OPENSSL_armcap_P
221 .word OPENSSL_armcap_P
-.Lsha256_block_data_order
226 .global sha256_block_data_order
227 .type sha256_block_data_order
,%function
228 sha256_block_data_order
:
229 .Lsha256_block_data_order
:
230 #if __ARM_ARCH__<7 && !defined(__thumb2__)
231 sub r3
,pc
,#8 @ sha256_block_data_order
233 adr r3
,.Lsha256_block_data_order
235 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
236 ldr r12
,.LOPENSSL_armcap
237 # if !defined(_WIN32)
238 ldr r12
,[r3
,r12
] @ OPENSSL_armcap_P
240 # if defined(__APPLE__) || defined(_WIN32)
243 tst r12
,#ARMV8_SHA256
248 add
$len,$inp,$len,lsl
#6 @ len to point at the end of inp
249 stmdb sp
!,{$ctx,$inp,$len,r4
-r11
,lr
}
250 ldmia
$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
251 sub $Ktbl,r3
,#256+32 @ K256
252 sub sp
,sp
,#16*4 @ alloca(X[16])
259 eor
$t3,$B,$C @ magic
262 for($i=0;$i<16;$i++) { &BODY_00_15
($i,@V); unshift(@V,pop(@V)); }
263 $code.=".Lrounds_16_xx:\n";
264 for (;$i<32;$i++) { &BODY_16_XX
($i,@V); unshift(@V,pop(@V)); }
267 ite
eq @ Thumb2 thing
, sanity check
in ARM
269 ldreq
$t3,[sp
,#16*4] @ pull ctx
272 add
$A,$A,$t2 @ h
+=Maj
(a
,b
,c
) from the past
287 ldr
$inp,[sp
,#17*4] @ pull inp
288 ldr
$t2,[sp
,#18*4] @ pull inp+len
291 stmia
$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
293 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
296 add sp
,sp
,#`16+3`*4 @ destroy frame
298 ldmia sp
!,{r4
-r11
,pc
}
300 ldmia sp
!,{r4
-r11
,lr
}
302 moveq pc
,lr @ be binary compatible with V4
, yet
303 bx lr @ interoperable with Thumb ISA
:-)
305 .size sha256_block_data_order
,.-sha256_block_data_order
307 ######################################################################
311 my @X=map("q$_",(0..3));
312 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# Map a NEON quad-register name "qN" to its low 64-bit half "d(2N)"
# (e.g. "q3" -> "d6"); returns "" when the argument is not a q-register.
# The empty prototype is historical — callers invoke it as &Dlo(...),
# which bypasses prototype checking.
sub Dlo()	{ shift=~m|q(1?[0-9])|?"d".($1*2):""; }
# Map a NEON quad-register name "qN" to its high 64-bit half "d(2N+1)"
# (e.g. "q3" -> "d7"); returns "" when the argument is not a q-register.
# The empty prototype is historical — callers invoke it as &Dhi(...),
# which bypasses prototype checking.
sub Dhi()	{ shift=~m|q(1?[0-9])|?"d".($1*2+1):""; }
319 sub AUTOLOAD
() # thunk [simplified] x86-style perlasm
320 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
322 $arg = "#$arg" if ($arg*1 eq $arg);
323 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
329 my @insns = (&$body,&$body,&$body,&$body);
330 my ($a,$b,$c,$d,$e,$f,$g,$h);
332 &vext_8
($T0,@X[0],@X[1],4); # X[1..4]
336 &vext_8
($T1,@X[2],@X[3],4); # X[9..12]
340 &vshr_u32
($T2,$T0,$sigma0[0]);
343 &vadd_i32
(@X[0],@X[0],$T1); # X[0..3] += X[9..12]
346 &vshr_u32
($T1,$T0,$sigma0[2]);
349 &vsli_32
($T2,$T0,32-$sigma0[0]);
352 &vshr_u32
($T3,$T0,$sigma0[1]);
358 &vsli_32
($T3,$T0,32-$sigma0[1]);
361 &vshr_u32
($T4,&Dhi
(@X[3]),$sigma1[0]);
364 &veor
($T1,$T1,$T3); # sigma0(X[1..4])
367 &vsli_32
($T4,&Dhi
(@X[3]),32-$sigma1[0]);
370 &vshr_u32
($T5,&Dhi
(@X[3]),$sigma1[2]);
373 &vadd_i32
(@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
379 &vshr_u32
($T4,&Dhi
(@X[3]),$sigma1[1]);
382 &vsli_32
($T4,&Dhi
(@X[3]),32-$sigma1[1]);
385 &veor
($T5,$T5,$T4); # sigma1(X[14..15])
388 &vadd_i32
(&Dlo
(@X[0]),&Dlo
(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
391 &vshr_u32
($T4,&Dlo
(@X[0]),$sigma1[0]);
394 &vsli_32
($T4,&Dlo
(@X[0]),32-$sigma1[0]);
397 &vshr_u32
($T5,&Dlo
(@X[0]),$sigma1[2]);
403 &vshr_u32
($T4,&Dlo
(@X[0]),$sigma1[1]);
406 &vld1_32
("{$T0}","[$Ktbl,:128]!");
409 &vsli_32
($T4,&Dlo
(@X[0]),32-$sigma1[1]);
412 &veor
($T5,$T5,$T4); # sigma1(X[16..17])
415 &vadd_i32
(&Dhi
(@X[0]),&Dhi
(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
418 &vadd_i32
($T0,$T0,@X[0]);
419 while($#insns>=2) { eval(shift(@insns)); }
420 &vst1_32
("{$T0}","[$Xfer,:128]!");
424 push(@X,shift(@X)); # "rotate" X[]
430 my @insns = (&$body,&$body,&$body,&$body);
431 my ($a,$b,$c,$d,$e,$f,$g,$h);
437 &vld1_32
("{$T0}","[$Ktbl,:128]!");
442 &vrev32_8
(@X[0],@X[0]);
447 &vadd_i32
($T0,$T0,@X[0]);
448 foreach (@insns) { eval; } # remaining instructions
449 &vst1_32
("{$T0}","[$Xfer,:128]!");
451 push(@X,shift(@X)); # "rotate" X[]
456 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
457 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
459 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
460 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
462 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
463 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
464 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
465 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
466 '&eor ($t2,$a,$b)', # a^b, b^c in next round
467 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
468 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
469 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
470 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
471 '&ldr ($t1,"[sp,#64]") if ($j==31)',
472 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
473 '&add ($d,$d,$h)', # d+=h
474 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
475 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
476 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
481 #if __ARM_MAX_ARCH__>=7
485 .global sha256_block_data_order_neon
486 .type sha256_block_data_order_neon
,%function
489 sha256_block_data_order_neon
:
491 stmdb sp
!,{r4
-r12
,lr
}
495 bic
$H,$H,#15 @ align for 128-bit stores
498 add
$len,$inp,$len,lsl
#6 @ len to point at the end of inp
500 vld1
.8
{@X[0]},[$inp]!
501 vld1
.8
{@X[1]},[$inp]!
502 vld1
.8
{@X[2]},[$inp]!
503 vld1
.8
{@X[3]},[$inp]!
504 vld1
.32
{$T0},[$Ktbl,:128]!
505 vld1
.32
{$T1},[$Ktbl,:128]!
506 vld1
.32
{$T2},[$Ktbl,:128]!
507 vld1
.32
{$T3},[$Ktbl,:128]!
508 vrev32
.8
@X[0],@X[0] @ yes
, even on
510 vrev32
.8
@X[1],@X[1] @ big
-endian
516 str
$t2,[sp
,#76] @ save original sp
517 vadd
.i32
$T0,$T0,@X[0]
518 vadd
.i32
$T1,$T1,@X[1]
519 vst1
.32
{$T0},[$Xfer,:128]!
520 vadd
.i32
$T2,$T2,@X[2]
521 vst1
.32
{$T1},[$Xfer,:128]!
522 vadd
.i32
$T3,$T3,@X[3]
523 vst1
.32
{$T2},[$Xfer,:128]!
524 vst1
.32
{$T3},[$Xfer,:128]!
536 &Xupdate
(\
&body_00_15
);
537 &Xupdate
(\
&body_00_15
);
538 &Xupdate
(\
&body_00_15
);
539 &Xupdate
(\
&body_00_15
);
541 teq
$t1,#0 @ check for K256 terminator
548 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
551 subeq
$inp,$inp,#64 @ avoid SEGV
552 vld1
.8
{@X[0]},[$inp]! @ load
next input block
553 vld1
.8
{@X[1]},[$inp]!
554 vld1
.8
{@X[2]},[$inp]!
555 vld1
.8
{@X[3]},[$inp]!
560 &Xpreload
(\
&body_00_15
);
561 &Xpreload
(\
&body_00_15
);
562 &Xpreload
(\
&body_00_15
);
563 &Xpreload
(\
&body_00_15
);
566 add
$A,$A,$t2 @ h
+=Maj
(a
,b
,c
) from the past
570 add
$A,$A,$t0 @ accumulate
592 ldreq sp
,[sp
,#76] @ restore original sp
597 ldmia sp
!,{r4
-r12
,pc
}
598 .size sha256_block_data_order_neon
,.-sha256_block_data_order_neon
602 ######################################################################
606 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
607 my @MSG=map("q$_",(8..11));
608 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
610 my $_byte = ($flavour =~ /win/ ?
"DCB" : ".byte");
613 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
615 # if defined(__thumb2__)
616 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
618 # define INST(a,b,c,d) $_byte a,b,c,d
621 .type sha256_block_data_order_armv8
,%function
623 sha256_block_data_order_armv8
:
625 vld1
.32
{$ABCD,$EFGH},[$ctx]
626 sub $Ktbl,$Ktbl,#256+32
627 add
$len,$inp,$len,lsl
#6 @ len to point at the end of inp
632 vld1
.8
{@MSG[0]-@MSG[1]},[$inp]!
633 vld1
.8
{@MSG[2]-@MSG[3]},[$inp]!
634 vld1
.32
{$W0},[$Ktbl]!
635 vrev32
.8
@MSG[0],@MSG[0]
636 vrev32
.8
@MSG[1],@MSG[1]
637 vrev32
.8
@MSG[2],@MSG[2]
638 vrev32
.8
@MSG[3],@MSG[3]
639 vmov
$ABCD_SAVE,$ABCD @ offload
640 vmov
$EFGH_SAVE,$EFGH
643 for($i=0;$i<12;$i++) {
645 vld1
.32
{$W1},[$Ktbl]!
646 vadd
.i32
$W0,$W0,@MSG[0]
647 sha256su0
@MSG[0],@MSG[1]
649 sha256h
$ABCD,$EFGH,$W0
650 sha256h2
$EFGH,$abcd,$W0
651 sha256su1
@MSG[0],@MSG[2],@MSG[3]
653 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
656 vld1
.32
{$W1},[$Ktbl]!
657 vadd
.i32
$W0,$W0,@MSG[0]
659 sha256h
$ABCD,$EFGH,$W0
660 sha256h2
$EFGH,$abcd,$W0
662 vld1
.32
{$W0},[$Ktbl]!
663 vadd
.i32
$W1,$W1,@MSG[1]
665 sha256h
$ABCD,$EFGH,$W1
666 sha256h2
$EFGH,$abcd,$W1
668 vld1
.32
{$W1},[$Ktbl]
669 vadd
.i32
$W0,$W0,@MSG[2]
670 sub $Ktbl,$Ktbl,#256-16 @ rewind
672 sha256h
$ABCD,$EFGH,$W0
673 sha256h2
$EFGH,$abcd,$W0
675 vadd
.i32
$W1,$W1,@MSG[3]
677 sha256h
$ABCD,$EFGH,$W1
678 sha256h2
$EFGH,$abcd,$W1
680 vadd
.i32
$ABCD,$ABCD,$ABCD_SAVE
681 vadd
.i32
$EFGH,$EFGH,$EFGH_SAVE
685 vst1
.32
{$ABCD,$EFGH},[$ctx]
688 .size sha256_block_data_order_armv8
,.-sha256_block_data_order_armv8
693 .asciz
"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
695 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
696 .extern OPENSSL_armcap_P
703 last if (!s/^#/@/ and !/^$/);
709 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
710 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
713 my ($mnemonic,$arg)=@_;
715 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
716 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
717 |(($2&7)<<17)|(($2&8)<<4)
718 |(($3&7)<<1) |(($3&8)<<2);
719 # since ARMv7 instructions are always encoded little-endian.
720 # correct solution is to use .inst directive, but older
721 # assemblers don't implement it:-(
722 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
723 $word&0xff,($word>>8)&0xff,
724 ($word>>16)&0xff,($word>>24)&0xff,
730 foreach (split($/,$code)) {
732 s/\`([^\`]*)\`/eval $1/geo;
734 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
736 s/\bret\b/bx lr/go or
737 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
# Close (and thereby flush) STDOUT explicitly: a buffered write error on
# the assembler pipe only surfaces at close time, and must be fatal.
close STDOUT or die "error closing STDOUT: $!";