#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

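# The arrays above hold the rotate/shift amounts of the SHA-256 functions
# from FIPS 180-4, listed here for reference:
#
#	Sigma0(x) = ROTR(x,2)  ^ ROTR(x,13) ^ ROTR(x,22)
#	Sigma1(x) = ROTR(x,6)  ^ ROTR(x,11) ^ ROTR(x,25)
#	sigma0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x>>3)
#	sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x>>10)
#
# The generated code rotates by the differences of these constants and
# factors out the smallest rotate, e.g. Sigma1(e) is produced as
# ROTR(e ^ ROTR(e,5) ^ ROTR(e,19), 6) via the shifter operand of the
# final add.
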
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr $t1,[$inp],#4 @ $i
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
	rev $t1,$t1
# endif
#else
	@ ldrb $t1,[$inp,#3] @ $i
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	ldrb $t2,[$inp,#2]
	ldrb $t0,[$inp,#1]
	orr $t1,$t1,$t2,lsl#8
	ldrb $t2,[$inp],#4
	orr $t1,$t1,$t0,lsl#16
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr $t1,$t1,$t2,lsl#24
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
	ldr $t2,[$Ktbl],#4 @ *K256++
	add $h,$h,$t1 @ h+=X[i]
	str $t1,[sp,#`$i%16`*4]
	eor $t1,$f,$g
	add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
	and $t1,$t1,$e
	add $h,$h,$t2 @ h+=K256[i]
	eor $t1,$t1,$g @ Ch(e,f,g)
	eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
	and $t2,$t2,#0xff
	cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4 @ prefetch
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t2,$a,$b @ a^b, b^c in next round
#else
	ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
	eor $t2,$a,$b @ a^b, b^c in next round
	ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
	eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
	and $t3,$t3,$t2 @ (b^c)&=(a^b)
	add $d,$d,$h @ d+=h
	eor $t3,$t3,$b @ Maj(a,b,c)
	add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
	@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

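# For reference, each BODY_00_15 invocation emits one SHA-256 round:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + W[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1;  h = T1 + T2
#
# with Ch(e,f,g) computed as ((f^g)&e)^g and Maj(a,b,c) computed as
# ((a^b)&(b^c))^b, where this round's a^b becomes the next round's b^c.
# The Maj() term is not added to h here; it is deferred to the
# "h+=Maj(a,b,c) from the past" add of the following round, which is also
# why $t2 and $t3 are swapped on exit.
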
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
	@ ldr $t4,[sp,#`($i+14)%16`*4]
	mov $t0,$t1,ror#$sigma0[0]
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	mov $t2,$t4,ror#$sigma1[0]
	eor $t0,$t0,$t1,ror#$sigma0[1]
	eor $t2,$t2,$t4,ror#$sigma1[1]
	eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
	ldr $t1,[sp,#`($i+0)%16`*4]
	eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
	ldr $t4,[sp,#`($i+9)%16`*4]

	add $t2,$t2,$t0
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
	add $t1,$t1,$t2
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
	add $t1,$t1,$t4 @ X[i]
___
	&BODY_00_15(@_);
}

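# For reference, BODY_16_XX extends the message schedule per FIPS 180-4:
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# with the sixteen most recent W[] words kept on the stack and indexed
# modulo 16, then falls through into BODY_00_15 for the round itself.
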
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
# define adrl adr
.thumb
# else
.code 32
# endif
#endif

.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
.align 5

.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub r3,pc,#8 @ sha256_block_data_order
#else
	adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr r12,.LOPENSSL_armcap
	ldr r12,[r3,r12] @ OPENSSL_armcap_P
	tst r12,#ARMV8_SHA256
	bne .LARMv8
	tst r12,#ARMV7_NEON
	bne .LNEON
#endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub $Ktbl,r3,#256+32 @ K256
	sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t3,$B,$C @ magic
	eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite eq @ Thumb2 thing, sanity check in ARM
#endif
	ldreq $t3,[sp,#16*4] @ pull ctx
	bne .Lrounds_16_xx

	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t0,[$t3,#0]
	ldr $t1,[$t3,#4]
	ldr $t2,[$t3,#8]
	add $A,$A,$t0
	ldr $t0,[$t3,#12]
	add $B,$B,$t1
	ldr $t1,[$t3,#16]
	add $C,$C,$t2
	ldr $t2,[$t3,#20]
	add $D,$D,$t0
	ldr $t0,[$t3,#24]
	add $E,$E,$t1
	ldr $t1,[$t3,#28]
	add $F,$F,$t2
	ldr $inp,[sp,#17*4] @ pull inp
	ldr $t2,[sp,#18*4] @ pull inp+len
	add $G,$G,$t0
	add $H,$H,$t1
	stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp $inp,$t2
	sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
	bne .Loop

	add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }

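# Dlo()/Dhi() map a NEON quad register to its lower/upper double-word half,
# e.g. "q8" -> "d16" and "d17" respectively.
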
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

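# For example, &vshr_u32($T2,$T0,$sigma0[0]) appends "\tvshr.u32\tq10,q8,#7\n"
# to $code, and &vld1_32("{$T0}","[$Ktbl,:128]!") becomes
# "vld1.32 {q8},[r14,:128]!"; the last argument only gets a '#' prefix when
# it is numeric.
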
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T2,$T0,$sigma0[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T1,$T0,$sigma0[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T2,$T0,32-$sigma0[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T3,$T0,$sigma0[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T1,$T1,$T2);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T3,$T0,32-$sigma0[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T1,$T1,$T3); # sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4); # sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5); # X[0..1] += sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4);
	eval(shift(@insns));
	eval(shift(@insns));
	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
	eval(shift(@insns));
	eval(shift(@insns));
	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&veor ($T5,$T5,$T4); # sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5); # X[2..3] += sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 ($T0,$T0,@X[0]);
	while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32 ("{$T0}","[$Xfer,:128]!");
	eval(shift(@insns));
	eval(shift(@insns));

	push(@X,shift(@X)); # "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vrev32_8 (@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vadd_i32 ($T0,$T0,@X[0]);
	foreach (@insns) { eval; } # remaining instructions
	&vst1_32 ("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X)); # "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add ($h,$h,$t1)', # h+=X[i]+K[i]
	'&eor ($t1,$f,$g)',
	'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
	'&and ($t1,$t1,$e)',
	'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
	'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor ($t1,$t1,$g)', # Ch(e,f,g)
	'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
	'&eor ($t2,$a,$b)', # a^b, b^c in next round
	'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
	'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
	'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
	'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
	'&ldr ($t1,"[sp,#64]") if ($j==31)',
	'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
	'&add ($d,$d,$h)', # d+=h
	'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
	'&eor ($t3,$t3,$b)', # Maj(a,b,c)
	'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

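# In the NEON loop below, each Xupdate() call advances the message schedule
# by four W[] words with vector instructions while evaluating four scalar
# rounds from body_00_15() in between (one deferred instruction per eval),
# so integer and NEON issue slots are filled concurrently.  Xpreload() does
# the same for the last sixteen rounds while byte-swapping the freshly
# loaded input block and pre-adding the round constants.
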
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
	stmdb sp!,{r4-r12,lr}

	sub $H,sp,#16*4+16
	adrl $Ktbl,K256
	bic $H,$H,#15 @ align for 128-bit stores
	mov $t2,sp
	mov sp,$H @ alloca
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp

	vld1.8 {@X[0]},[$inp]!
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	vld1.32 {$T0},[$Ktbl,:128]!
	vld1.32 {$T1},[$Ktbl,:128]!
	vld1.32 {$T2},[$Ktbl,:128]!
	vld1.32 {$T3},[$Ktbl,:128]!
	vrev32.8 @X[0],@X[0] @ yes, even on
	str $ctx,[sp,#64]
	vrev32.8 @X[1],@X[1] @ big-endian
	str $inp,[sp,#68]
	mov $Xfer,sp
	vrev32.8 @X[2],@X[2]
	str $len,[sp,#72]
	vrev32.8 @X[3],@X[3]
	str $t2,[sp,#76] @ save original sp
	vadd.i32 $T0,$T0,@X[0]
	vadd.i32 $T1,$T1,@X[1]
	vst1.32 {$T0},[$Xfer,:128]!
	vadd.i32 $T2,$T2,@X[2]
	vst1.32 {$T1},[$Xfer,:128]!
	vadd.i32 $T3,$T3,@X[3]
	vst1.32 {$T2},[$Xfer,:128]!
	vst1.32 {$T3},[$Xfer,:128]!

	ldmia $ctx,{$A-$H}
	sub $Xfer,$Xfer,#64
	ldr $t1,[sp,#0]
	eor $t2,$t2,$t2
	eor $t3,$B,$C
	b .L_00_48

.align 4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq $t1,#0 @ check for K256 terminator
	ldr $t1,[sp,#0]
	sub $Xfer,$Xfer,#64
	bne .L_00_48

	ldr $inp,[sp,#68]
	ldr $t0,[sp,#72]
	sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
	teq $inp,$t0
	it eq
	subeq $inp,$inp,#64 @ avoid SEGV
	vld1.8 {@X[0]},[$inp]! @ load next input block
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	it ne
	strne $inp,[sp,#68]
	mov $Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr $t0,[$t1,#0]
	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t2,[$t1,#4]
	ldr $t3,[$t1,#8]
	ldr $t4,[$t1,#12]
	add $A,$A,$t0 @ accumulate
	ldr $t0,[$t1,#16]
	add $B,$B,$t2
	ldr $t2,[$t1,#20]
	add $C,$C,$t3
	ldr $t3,[$t1,#24]
	add $D,$D,$t4
	ldr $t4,[$t1,#28]
	add $E,$E,$t0
	str $A,[$t1],#4
	add $F,$F,$t2
	str $B,[$t1],#4
	add $G,$G,$t3
	str $C,[$t1],#4
	add $H,$H,$t4
	str $D,[$t1],#4
	stmia $t1,{$E-$H}

	ittte ne
	movne $Xfer,sp
	ldrne $t1,[sp,#0]
	eorne $t2,$t2,$t2
	ldreq sp,[sp,#76] @ restore original sp
	itt ne
	eorne $t3,$B,$C
	bne .L_00_48

	ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif

.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr $Ktbl,.LARMv8
	sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl $Ktbl,K256
# endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp

.Loop_v8:
	vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
	vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
	vld1.32 {$W0},[$Ktbl]!
	vrev32.8 @MSG[0],@MSG[0]
	vrev32.8 @MSG[1],@MSG[1]
	vrev32.8 @MSG[2],@MSG[2]
	vrev32.8 @MSG[3],@MSG[3]
	vmov $ABCD_SAVE,$ABCD @ offload
	vmov $EFGH_SAVE,$EFGH
	teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	sha256su0 @MSG[0],@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vld1.32 {$W0},[$Ktbl]!
	vadd.i32 $W1,$W1,@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vld1.32 {$W1},[$Ktbl]
	vadd.i32 $W0,$W0,@MSG[2]
	sub $Ktbl,$Ktbl,#256-16 @ rewind
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vadd.i32 $W1,$W1,@MSG[3]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
	vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
	it ne
	bne .Loop_v8

	vst1.32 {$ABCD,$EFGH},[$ctx]

	ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
	"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );

  sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
  }
}

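# For example, "sha256h q0,q1,q12" (i.e. sha256h $ABCD,$EFGH,$W0 above) is
# rewritten by the substitution below into
#	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
# which the INST() macro emits as raw .byte data in either ARM or Thumb2
# instruction order.
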
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush