3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in the sense that it supports both big- and
12 # little-endian cases. It also supports both 32- and 64-bit modes
13 # of operation. The latter is achieved by limiting the number of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
23 # Performance in cycles per byte processed with 128-bit key:
26 # Apple A7 2.39 1.20 1.20
27 # Cortex-A53 2.45 1.87 1.94
28 # Cortex-A57 3.64 1.34 1.32
31 open STDOUT
,">".shift;
38 #if __ARM_MAX_ARCH__>=7
41 $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
42 $code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
43 #^^^^^^ this is done to simplify adoption by not depending
46 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
47 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
48 # maintain both 32- and 64-bit codes within single module and
49 # transliterate common code to either flavour with regex voodoo.
52 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
53 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
54 $flavour=~/64/?
map("q$_",(0..6)) : map("q$_",(0..3,8..10));
60 .long
0x01,0x01,0x01,0x01
61 .long
0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate
-n
-splat
62 .long
0x1b,0x1b,0x1b,0x1b
64 .globl
${prefix
}_set_encrypt_key
65 .type
${prefix
}_set_encrypt_key
,%function
67 ${prefix
}_set_encrypt_key
:
70 $code.=<<___
if ($flavour =~ /64/);
71 stp x29
,x30
,[sp
,#-16]!
91 veor
$zero,$zero,$zero
92 vld1
.8
{$in0},[$inp],#16
93 mov
$bits,#8 // reuse $bits
94 vld1
.32
{$rcon,$mask},[$ptr],#32
102 vtbl
.8 $key,{$in0},$mask
103 vext
.8 $tmp,$zero,$in0,#12
104 vst1
.32
{$in0},[$out],#16
109 vext
.8 $tmp,$zero,$tmp,#12
111 vext
.8 $tmp,$zero,$tmp,#12
114 vshl
.u8
$rcon,$rcon,#1
118 vld1
.32
{$rcon},[$ptr]
120 vtbl
.8 $key,{$in0},$mask
121 vext
.8 $tmp,$zero,$in0,#12
122 vst1
.32
{$in0},[$out],#16
126 vext
.8 $tmp,$zero,$tmp,#12
128 vext
.8 $tmp,$zero,$tmp,#12
131 vshl
.u8
$rcon,$rcon,#1
134 vtbl
.8 $key,{$in0},$mask
135 vext
.8 $tmp,$zero,$in0,#12
136 vst1
.32
{$in0},[$out],#16
140 vext
.8 $tmp,$zero,$tmp,#12
142 vext
.8 $tmp,$zero,$tmp,#12
146 vst1
.32
{$in0},[$out]
154 vld1
.8
{$in1},[$inp],#8
155 vmov
.i8
$key,#8 // borrow $key
156 vst1
.32
{$in0},[$out],#16
157 vsub
.i8
$mask,$mask,$key // adjust the mask
160 vtbl
.8 $key,{$in1},$mask
161 vext
.8 $tmp,$zero,$in0,#12
162 vst1
.32
{$in1},[$out],#8
167 vext
.8 $tmp,$zero,$tmp,#12
169 vext
.8 $tmp,$zero,$tmp,#12
172 vdup
.32 $tmp,${in0
}[3]
175 vext
.8 $in1,$zero,$in1,#12
176 vshl
.u8
$rcon,$rcon,#1
180 vst1
.32
{$in0},[$out],#16
192 vst1
.32
{$in0},[$out],#16
195 vtbl
.8 $key,{$in1},$mask
196 vext
.8 $tmp,$zero,$in0,#12
197 vst1
.32
{$in1},[$out],#16
202 vext
.8 $tmp,$zero,$tmp,#12
204 vext
.8 $tmp,$zero,$tmp,#12
207 vshl
.u8
$rcon,$rcon,#1
209 vst1
.32
{$in0},[$out],#16
212 vdup
.32 $key,${in0
}[3] // just splat
213 vext
.8 $tmp,$zero,$in1,#12
217 vext
.8 $tmp,$zero,$tmp,#12
219 vext
.8 $tmp,$zero,$tmp,#12
230 mov x0
,$ptr // return value
231 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
233 .size
${prefix
}_set_encrypt_key
,.-${prefix
}_set_encrypt_key
235 .globl
${prefix
}_set_decrypt_key
236 .type
${prefix
}_set_decrypt_key
,%function
238 ${prefix
}_set_decrypt_key
:
240 $code.=<<___
if ($flavour =~ /64/);
241 stp x29
,x30
,[sp
,#-16]!
244 $code.=<<___
if ($flavour !~ /64/);
253 sub $out,$out,#240 // restore original $out
255 add
$inp,$out,x12
,lsl
#4 // end of key schedule
257 vld1
.32
{v0
.16b
},[$out]
258 vld1
.32
{v1
.16b
},[$inp]
259 vst1
.32
{v0
.16b
},[$inp],x4
260 vst1
.32
{v1
.16b
},[$out],#16
263 vld1
.32
{v0
.16b
},[$out]
264 vld1
.32
{v1
.16b
},[$inp]
267 vst1
.32
{v0
.16b
},[$inp],x4
268 vst1
.32
{v1
.16b
},[$out],#16
272 vld1
.32
{v0
.16b
},[$out]
274 vst1
.32
{v0
.16b
},[$inp]
276 eor x0
,x0
,x0
// return value
279 $code.=<<___
if ($flavour !~ /64/);
282 $code.=<<___
if ($flavour =~ /64/);
287 .size
${prefix
}_set_decrypt_key
,.-${prefix
}_set_decrypt_key
293 my ($e,$mc) = $dir eq "en" ?
("e","mc") : ("d","imc");
294 my ($inp,$out,$key)=map("x$_",(0..2));
296 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
299 .globl
${prefix
}_
${dir
}crypt
300 .type
${prefix
}_
${dir
}crypt,%function
302 ${prefix
}_
${dir
}crypt:
303 ldr
$rounds,[$key,#240]
304 vld1
.32
{$rndkey0},[$key],#16
305 vld1
.8
{$inout},[$inp]
306 sub $rounds,$rounds,#2
307 vld1
.32
{$rndkey1},[$key],#16
310 aes
$e $inout,$rndkey0
311 vld1
.32
{$rndkey0},[$key],#16
313 subs
$rounds,$rounds,#2
314 aes
$e $inout,$rndkey1
315 vld1
.32
{$rndkey1},[$key],#16
319 aes
$e $inout,$rndkey0
320 vld1
.32
{$rndkey0},[$key]
322 aes
$e $inout,$rndkey1
323 veor
$inout,$inout,$rndkey0
325 vst1
.8
{$inout},[$out]
327 .size
${prefix
}_
${dir
}crypt,.-${prefix
}_
${dir
}crypt
334 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
335 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
336 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
338 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
340 ### q8-q15 preloaded key schedule
343 .globl
${prefix
}_cbc_encrypt
344 .type
${prefix
}_cbc_encrypt
,%function
346 ${prefix
}_cbc_encrypt
:
348 $code.=<<___
if ($flavour =~ /64/);
349 stp x29
,x30
,[sp
,#-16]!
352 $code.=<<___
if ($flavour !~ /64/);
355 vstmdb sp
!,{d8
-d15
} @ ABI specification says so
356 ldmia ip
,{r4
-r5
} @ load remaining args
364 cmp $enc,#0 // en- or decrypting?
365 ldr
$rounds,[$key,#240]
367 vld1
.8
{$ivec},[$ivp]
368 vld1
.8
{$dat},[$inp],$step
370 vld1
.32
{q8
-q9
},[$key] // load key schedule
...
371 sub $rounds,$rounds,#6
372 add
$key_,$key,x5
,lsl
#4 // pointer to last 7 round keys
373 sub $rounds,$rounds,#2
374 vld1
.32
{q10
-q11
},[$key_],#32
375 vld1
.32
{q12
-q13
},[$key_],#32
376 vld1
.32
{q14
-q15
},[$key_],#32
377 vld1
.32
{$rndlast},[$key_]
385 veor
$rndzero_n_last,q8
,$rndlast
390 vld1
.32
{q8
},[$key_],#16
394 vld1
.32
{q9
},[$key_],#16
409 vld1
.8
{q8
},[$inp],$step
412 veor q8
,q8
,$rndzero_n_last
415 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
421 veor
$ivec,$dat,$rndlast
422 vst1
.8
{$ivec},[$out],#16
429 vld1
.32
{$in0-$in1},[$key_]
436 vst1
.8
{$ivec},[$out],#16
450 vld1
.8
{q8
},[$inp],$step
457 veor q8
,q8
,$rndzero_n_last
459 veor
$ivec,$dat,$rndlast
460 b
.hs
.Loop_cbc_enc128
462 vst1
.8
{$ivec},[$out],#16
466 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
470 vld1
.8
{$dat2},[$inp],#16
471 subs
$len,$len,#32 // bias
475 vorr
$in2,$dat2,$dat2
478 vorr
$dat1,$dat2,$dat2
479 vld1
.8
{$dat2},[$inp],#16
481 vorr
$in1,$dat1,$dat1
482 vorr
$in2,$dat2,$dat2
488 vld1
.32
{q8
},[$key_],#16
496 vld1
.32
{q9
},[$key_],#16
505 veor
$tmp0,$ivec,$rndlast
509 veor
$tmp1,$in0,$rndlast
513 veor
$tmp2,$in1,$rndlast
519 mov
.lo x6
,$len // x6
, $cnt, is zero at this point
523 add
$inp,$inp,x6
// $inp is adjusted
in such way that
524 // at
exit from the
loop $dat1-$dat2
525 // are loaded with
last "words"
533 vld1
.8
{$in0},[$inp],#16
537 vld1
.8
{$in1},[$inp],#16
541 vld1
.8
{$in2},[$inp],#16
545 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
551 veor
$tmp0,$tmp0,$dat0
552 veor
$tmp1,$tmp1,$dat1
553 veor
$dat2,$dat2,$tmp2
554 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
556 vst1
.8
{$tmp0},[$out],#16
558 vst1
.8
{$tmp1},[$out],#16
559 vst1
.8
{$dat2},[$out],#16
570 vld1
.32
{q8
},[$key_],#16
576 vld1
.32
{q9
},[$key_],#16
598 veor
$tmp1,$ivec,$rndlast
603 veor
$tmp2,$in1,$rndlast
607 veor
$tmp1,$tmp1,$dat1
608 veor
$tmp2,$tmp2,$dat2
610 vst1
.8
{$tmp1},[$out],#16
611 vst1
.8
{$tmp2},[$out],#16
615 veor
$tmp1,$tmp1,$dat2
617 vst1
.8
{$tmp1},[$out],#16
620 vst1
.8
{$ivec},[$ivp]
624 $code.=<<___
if ($flavour !~ /64/);
628 $code.=<<___
if ($flavour =~ /64/);
633 .size
${prefix
}_cbc_encrypt
,.-${prefix
}_cbc_encrypt
637 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
638 my ($rounds,$cnt,$key_)=("w5","w6","x7");
639 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
640 my $step="x12"; # aliases with $tctr2
642 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
643 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
645 my ($dat,$tmp)=($dat0,$tmp0);
647 ### q8-q15 preloaded key schedule
650 .globl
${prefix
}_ctr32_encrypt_blocks
651 .type
${prefix
}_ctr32_encrypt_blocks
,%function
653 ${prefix
}_ctr32_encrypt_blocks
:
655 $code.=<<___
if ($flavour =~ /64/);
656 stp x29
,x30
,[sp
,#-16]!
659 $code.=<<___
if ($flavour !~ /64/);
661 stmdb sp
!,{r4
-r10
,lr
}
662 vstmdb sp
!,{d8
-d15
} @ ABI specification says so
663 ldr r4
, [ip
] @ load remaining arg
666 ldr
$rounds,[$key,#240]
668 ldr
$ctr, [$ivp, #12]
669 vld1
.32
{$dat0},[$ivp]
671 vld1
.32
{q8
-q9
},[$key] // load key schedule
...
672 sub $rounds,$rounds,#4
675 add
$key_,$key,x5
,lsl
#4 // pointer to last 5 round keys
676 sub $rounds,$rounds,#2
677 vld1
.32
{q12
-q13
},[$key_],#32
678 vld1
.32
{q14
-q15
},[$key_],#32
679 vld1
.32
{$rndlast},[$key_]
686 vorr
$dat1,$dat0,$dat0
688 vorr
$dat2,$dat0,$dat0
690 vorr
$ivec,$dat0,$dat0
692 vmov
.32 ${dat1
}[3],$tctr1
695 sub $len,$len,#3 // bias
696 vmov
.32 ${dat2
}[3],$tctr2
704 vld1
.32
{q8
},[$key_],#16
712 vld1
.32
{q9
},[$key_],#16
723 vld1
.8
{$in0},[$inp],#16
726 vorr
$dat0,$ivec,$ivec
728 vld1
.8
{$in1},[$inp],#16
731 vorr
$dat1,$ivec,$ivec
733 vld1
.8
{$in2},[$inp],#16
736 vorr
$dat2,$ivec,$ivec
741 veor
$in0,$in0,$rndlast
746 veor
$in1,$in1,$rndlast
751 veor
$in2,$in2,$rndlast
754 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
757 vmov
.32 ${dat0
}[3], $tctr0
762 vmov
.32 ${dat1
}[3], $tctr1
767 vmov
.32 ${dat2
}[3], $tctr2
777 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
778 vst1
.8
{$in0},[$out],#16
779 vst1
.8
{$in1},[$out],#16
780 vst1
.8
{$in2},[$out],#16
792 vld1
.32
{q8
},[$key_],#16
798 vld1
.32
{q9
},[$key_],#16
811 vld1
.8
{$in0},[$inp],$step
823 veor
$in0,$in0,$rndlast
826 veor
$in1,$in1,$rndlast
833 vst1
.8
{$in0},[$out],#16
839 $code.=<<___
if ($flavour !~ /64/);
841 ldmia sp
!,{r4
-r10
,pc
}
843 $code.=<<___
if ($flavour =~ /64/);
848 .size
${prefix
}_ctr32_encrypt_blocks
,.-${prefix
}_ctr32_encrypt_blocks
854 ########################################
855 if ($flavour =~ /64/) { ######## 64-bit code
857 "aesd" => 0x4e285800, "aese" => 0x4e284800,
858 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
861 my ($mnemonic,$arg)=@_;
863 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
864 sprintf ".inst\t0x%08x\t//%s %s",
865 $opcode{$mnemonic}|$1|($2<<5),
869 foreach(split("\n",$code)) {
870 s/\`([^\`]*)\`/eval($1)/geo;
872 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
873 s/@\s/\/\
//o; # old->new style commentary
875 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
876 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
877 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
878 s/vmov\.i8/movi/o or # fix up legacy mnemonics
880 s/vrev32\.8/rev32/o or
883 s/^(\s+)v/$1/o or # strip off v prefix
886 # fix up remaining legacy suffixes
888 m/\],#8/o and s/\.16b/\.8b/go;
889 s/\.[ui]?32//o and s/\.16b/\.4s/go;
890 s/\.[ui]?64//o and s/\.16b/\.2d/go;
891 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
895 } else { ######## 32-bit code
897 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
898 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
901 my ($mnemonic,$arg)=@_;
903 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
904 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
905 |(($2&7)<<1) |(($2&8)<<2);
906 # since ARMv7 instructions are always encoded little-endian.
907 # correct solution is to use .inst directive, but older
908 # assemblers don't implement it:-(
909 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
910 $word&0xff,($word>>8)&0xff,
911 ($word>>16)&0xff,($word>>24)&0xff,
919 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
920 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
921 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
927 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
928 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
934 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
935 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
938 foreach(split("\n",$code)) {
939 s/\`([^\`]*)\`/eval($1)/geo;
941 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
942 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
943 s/\/\/\s?
/@ /o; # new->old style commentary
945 # fix up remaining new-style suffixes
946 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
949 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
950 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
951 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
952 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
953 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
955 s/^(\s+)mov\./$1mov/o or
956 s/^(\s+)ret/$1bx\tlr/o;