#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
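
# A typical invocation supplies the flavour first and the output file
# last (hypothetical example; flavour names are defined by the build
# system's perlasm scheme):
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S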

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
	die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
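
# The eight SHA-256 working variables a..h live in r4-r11; r12 ($t2)
# doubles as scratch and r14 ($Ktbl) walks the K256 constant table.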

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
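# Rotate/shift amounts for the FIPS 180-4 Sigma/sigma functions, e.g.
# Sigma0(x) = (x ror 2) xor (x ror 13) xor (x ror 22); in the lower-case
# sigma functions the third amount is a logical shift right, not a rotate.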

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

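# BODY_16_XX computes the message-schedule expansion for rounds 16..63,
# X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16],
# then falls through to the common round body above.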
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
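@ K256 holds the 64 round constants of FIPS 180-4 (the first 32 bits of
@ the fractional parts of the cube roots of the first 64 primes); the
@ zero word lets the NEON code spot the end of the table.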
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
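# Rounds 16..63 reuse the sixteen unrolled BODY_16_XX bodies: execution
# branches back to .Lrounds_16_xx twice more, and the '#if $i==31' check
# on the low byte of the K256 word just loaded (0xf2 marks the last
# constant) determines when to fall through.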
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

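# Dlo()/Dhi() map a NEON quad register "qN" to its low/high doubleword
# halves; AUTOLOAD() turns any undefined sub call such as &vshr_u32(...)
# into the text of the matching NEON instruction ("vshr.u32\t...")
# appended to $code.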
sub Dlo()	{ shift=~m|q([1]?[0-9])|?"d".($1*2):"";    }
sub Dhi()	{ shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";  }

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

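# Xupdate() emits the NEON message-schedule update for four rounds worth
# of X[], interleaving the vector arithmetic with the scalar round
# fragments supplied by $body (body_00_15 below).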
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	$_byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
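# Twelve quad-rounds interleave message-schedule updates
# (sha256su0/sha256su1) with the compression steps (sha256h/sha256h2);
# the four quad-rounds emitted after this loop consume the remaining
# schedule words without further expansion.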
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

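# Reproduce the comment banner at the top of this file in the generated
# output, converting the leading '#' into the assembler's '@' comment
# character and stopping at the first non-comment line.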
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

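  # unsha256() assembles the SHA-256 crypto-extension mnemonics by hand
  # (via the INST macro) so the module still builds with assemblers that
  # predate these instructions.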
  sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
      my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				   |(($2&7)<<17)|(($2&8)<<4)
				   |(($3&7)<<1) |(($3&8)<<2);
      # ARMv7 instructions are always encoded little-endian, so emitting
      # the raw bytes works in either endianness. The correct solution is
      # the .inst directive, but older assemblers don't implement it:-(
      sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
  }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush