#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than the integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

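# For reference, the SHA-256 logical functions these rotation/shift amounts
# implement can be written in plain Perl (an illustrative sketch only, not
# used by the generator; helper names are made up):
#
#	sub ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
#	sub Sigma0 { my $x=shift; ror32($x,2)^ror32($x,13)^ror32($x,22) }
#	sub Sigma1 { my $x=shift; ror32($x,6)^ror32($x,11)^ror32($x,25) }
#	sub sigma0 { my $x=shift; ror32($x,7)^ror32($x,18)^($x>>3) }
#	sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10) }
#
# Note that the third element of @sigma0/@sigma1 is a logical shift, not a
# rotate, which is why the code below pairs two "ror" with one "lsr".
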
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
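
# For reference, one round of the compression function in plain Perl (a
# sketch using the helpers above; the "from the past" comments in the code
# refer to the final "h += Maj(a,b,c)" of the previous round, which is
# deferred into the next round for better scheduling):
#
#	$T1 = ($h + Sigma1($e) + ((($f ^ $g) & $e) ^ $g)	# Ch(e,f,g)
#	       + $K256[$i] + $X[$i]) & 0xffffffff;
#	$T2 = (Sigma0($a) + ((($a ^ $b) & ($b ^ $c)) ^ $b))	# Maj(a,b,c)
#	      & 0xffffffff;
#	($h,$g,$f,$e,$d,$c,$b,$a) =
#	($g,$f,$e,($d+$T1)&0xffffffff,$c,$b,$a,($T1+$T2)&0xffffffff);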

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
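
# The expansion above implements the SHA-256 message schedule recurrence,
# with X[] kept as a 16-word ring buffer on the stack (indices mod 16):
#
#	$X[$i] = ($X[$i-16] + sigma0($X[$i-15])
#	          + $X[$i-7] + sigma1($X[$i-2])) & 0xffffffff;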

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
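
# Emit 16 fully unrolled BODY_00_15 rounds, then the 16-round .Lrounds_16_xx
# body, which the generated code executes three times to cover rounds
# 16..63; unshift(@V,pop(@V)) rotates the a..h register assignment between
# rounds so no register moves ever need to be emitted.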
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
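
# So a call like &vshr_u32($T2,$T0,$sigma0[0]) appends the line
# "vshr.u32	q10,q8,#7" to $code: the sub name becomes the mnemonic
# (with "_" turned into ".") and a purely numeric final argument gets
# the "#" immediate prefix.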

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
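
# Each NEON step of the schedule update above is interleaved with scalar
# instructions drawn from @insns (four rounds' worth of body_00_15), so
# the integer pipeline retires rounds while the NEON unit computes the
# next four X[] words.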

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
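
# body_00_15 returns the round as a list of code strings instead of
# emitting it directly, so Xupdate/Xpreload can interleave the scalar
# instructions one at a time between NEON ops; $j counts rounds in order
# to schedule the X[] loads, the K256-terminator peek and the saved-ctx
# reload at the right points.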

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0			@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64		@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!		@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0		@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]		@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

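# The sha256h/sha256h2/sha256su0/sha256su1 mnemonics below are emitted as
# raw bytes through the INST() macro (see unsha256() at the bottom of this
# file), so the code still assembles with toolchains that predate the
# ARMv8 Crypto Extensions.
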
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	$_byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq	$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it	ne
	bne	.Loop_v8

	vst1.32	{$ABCD,$EFGH},[$ctx]

	ret	@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

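# Copy this script's leading comment block into the output, turning the
# "#" comment character into assembly's "@", so the license and
# attribution header survives into the generated file.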
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # Emit the raw bytes, since ARMv7 instructions are always
	    # encoded little-endian. The correct solution would be the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
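
# A worked example (illustrative, derived from the field layout above):
# "sha256h q0,q1,q2" gives $word = 0xf3000c40|(1<<17)|(2<<1) = 0xf3020c44,
# emitted little-endian as INST(0x44,0x0c,0x02,0xf3).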

# Final pass over the generated code: expand `...` compile-time arithmetic,
# byte-encode the SHA-256 mnemonics, and keep the output ARMv4-compatible.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush