#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than the integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

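# For reference, the SHA-256 logical functions these rotation/shift amounts
# implement can be written in plain Perl (an illustrative sketch only, not
# used by the generator; helper names are made up):
#
#	sub ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
#	sub Sigma0 { my $x=shift; ror32($x,2)^ror32($x,13)^ror32($x,22) }
#	sub Sigma1 { my $x=shift; ror32($x,6)^ror32($x,11)^ror32($x,25) }
#	sub sigma0 { my $x=shift; ror32($x,7)^ror32($x,18)^($x>>3) }
#	sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10) }
#
# Note that the third element of @sigma0/@sigma1 is a logical shift, not a
# rotate, which is why the code below pairs two "ror" with one "lsr".
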
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
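
# For reference, one round of the compression function in plain Perl (a
# sketch using the helpers above; the "from the past" comments in the code
# refer to the final "h += Maj(a,b,c)" of the previous round, which is
# deferred into the next round for better scheduling):
#
#	$T1 = ($h + Sigma1($e) + ((($f ^ $g) & $e) ^ $g)	# Ch(e,f,g)
#	       + $K256[$i] + $X[$i]) & 0xffffffff;
#	$T2 = (Sigma0($a) + ((($a ^ $b) & ($b ^ $c)) ^ $b))	# Maj(a,b,c)
#	      & 0xffffffff;
#	($h,$g,$f,$e,$d,$c,$b,$a) =
#	($g,$f,$e,($d+$T1)&0xffffffff,$c,$b,$a,($T1+$T2)&0xffffffff);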

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
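
# The expansion above implements the SHA-256 message schedule recurrence,
# with X[] kept as a 16-word ring buffer on the stack (indices mod 16):
#
#	$X[$i] = ($X[$i-16] + sigma0($X[$i-15])
#	          + $X[$i-7] + sigma1($X[$i-2])) & 0xffffffff;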

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
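
# Emit 16 fully unrolled BODY_00_15 rounds, then the 16-round .Lrounds_16_xx
# body, which the generated code executes three times to cover rounds
# 16..63; unshift(@V,pop(@V)) rotates the a..h register assignment between
# rounds so no register moves ever need to be emitted.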
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
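
# So a call like &vshr_u32($T2,$T0,$sigma0[0]) appends the line
# "vshr.u32	q10,q8,#7" to $code: the sub name becomes the mnemonic
# (with "_" turned into ".") and a purely numeric final argument gets
# the "#" immediate prefix.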

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
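
# Each NEON step of the schedule update above is interleaved with scalar
# instructions drawn from @insns (four rounds' worth of body_00_15), so
# the integer pipeline retires rounds while the NEON unit computes the
# next four X[] words.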

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
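
# body_00_15 returns the round as a list of code strings instead of
# emitting it directly, so Xupdate/Xpreload can interleave the scalar
# instructions one at a time between NEON ops; $j counts rounds in order
# to schedule the X[] loads, the K256-terminator peek and the saved-ctx
# reload at the right points.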

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0			@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64		@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!		@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0		@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]		@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

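# The sha256h/sha256h2/sha256su0/sha256su1 mnemonics below are emitted as
# raw bytes through the INST() macro (see unsha256() at the bottom of this
# file), so the code still assembles with toolchains that predate the
# ARMv8 Crypto Extensions.
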
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	$_byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq	$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it	ne
	bne	.Loop_v8

	vst1.32	{$ABCD,$EFGH},[$ctx]

	ret	@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

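# Copy this script's leading comment block into the output, turning the
# "#" comment character into assembly's "@", so the license and
# attribution header survives into the generated file.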
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # Emit the raw bytes, since ARMv7 instructions are always
	    # encoded little-endian. The correct solution would be the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
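
# A worked example (illustrative, derived from the field layout above):
# "sha256h q0,q1,q2" gives $word = 0xf3000c40|(1<<17)|(2<<1) = 0xf3020c44,
# emitted little-endian as INST(0x44,0x0c,0x02,0xf3).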

# Final pass over the generated code: expand `...` compile-time arithmetic,
# byte-encode the SHA-256 mnemonics, and keep the output ARMv4-compatible.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush