#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

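# The FIPS 180-4 functions these tables parametrize are
#
#	Sigma0(x)=ROR(x,2)^ROR(x,13)^ROR(x,22)
#	Sigma1(x)=ROR(x,6)^ROR(x,11)^ROR(x,25)
#	sigma0(x)=ROR(x,7)^ROR(x,18)^(x>>3)
#	sigma1(x)=ROR(x,17)^ROR(x,19)^(x>>10)
#
# note that the last element of @sigma0/@sigma1 is a plain shift,
# which is why the scalar code uses lsr for it. Each BODY_00_15 call
# emits one round:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	d += T1;	h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Ch(e,f,g)=((f^g)&e)^g and Maj(a,b,c)=((b^c)&(a^b))^b; the
# final "h+=Maj(a,b,c)" is deferred to the following round ("from
# the past" in the comments), which shortens the dependency chain.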
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

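# BODY_16_XX extends the message schedule, kept as a 16-word ring
# buffer on the stack, per FIPS 180-4:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# hence the ($i+14)%16, ($i+9)%16, ($i+1)%16 and ($i+0)%16 offsets,
# and then falls through to an ordinary BODY_00_15 round.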
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

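@ K256 holds the SHA-256 round constants: the first 32 bits of the
@ fractional parts of the cube roots of the first 64 primes
@ (FIPS 180-4, section 4.2.2).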
.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

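@ The integer-only routine below doubles as the dispatcher: at run
@ time it consults OPENSSL_armcap_P (unless compiled for the kernel)
@ and branches to the NEON or ARMv8 code path if the corresponding
@ capability bit is set. Its frame keeps X[16] at sp, with the ctx,
@ inp and inp+len words spilled at sp+64, sp+68 and sp+72.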
.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
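
# Dlo()/Dhi() map a NEON quad register "qN" to its lower and upper
# doubleword aliases d(2N) and d(2N+1), so that 64-bit halves of the
# X[] vectors can be addressed individually.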

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
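
# Undefined subs fall through to AUTOLOAD, which converts the call
# into an instruction: underscores become dots and a numeric final
# argument gains an immediate marker, so that, for instance,
# &vshr_u32($T2,$T0,7) emits "vshr.u32 q10,q8,#7".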
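
# Xupdate() advances the schedule four words at a time: the vector
# code computes X[0..3] += sigma0(X[1..4]) + X[9..12] and folds in
# sigma1() in two 2-word steps (it depends on words produced within
# the same call), while four scalar round bodies are interleaved one
# queued instruction at a time to keep both pipelines busy.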
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

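# Xpreload() serves the last 16 rounds of a block: no schedule update
# is needed anymore, so between scalar round bodies it merely loads
# the next K256 vector, byte-swaps the freshly loaded input block and
# stages X[i]+K256[i] in the transfer area for the next iteration.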
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

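# body_00_15() is the scalar round expressed as a list of code
# strings, so Xupdate/Xpreload can dole them out one at a time; $j
# counts rounds, and the last element rotates @V and swaps $t2/$t3
# exactly like the integer-only BODY_00_15 does.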
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!			@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

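# The ARMv8 path uses the Crypto Extensions: sha256su0/sha256su1
# perform the message-schedule update, while each sha256h/sha256h2
# pair advances the ABCD/EFGH halves of the state by four rounds, so
# 64 rounds take 16 such groups: 12 with schedule update in the loop
# below plus 4 without. INST() emits raw byte encodings (see
# unsha256() at the bottom) so that assemblers unaware of these
# instructions can still build this path.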
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

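# Replicate the comment header of this file in the generated
# assembler, converting "#" comments to "@" ones; stop at the first
# line that is neither a comment nor empty.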
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40   );

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte-by-byte emission. The correct solution would be the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

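# Final pass over $code: evaluate the `...` expressions (constant
# rotate amounts), substitute sha256* mnemonics with their INST()
# byte encodings, and turn "ret" into "bx lr" or, failing that,
# "bx lr" into its raw .word encoding so the result still assembles
# with -march=armv4.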
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush