2 # Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. The latter is achieved by limiting the amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
# Parse perlasm-style command-line arguments:
#   $output  - last argument, if it looks like a filename (has an extension)
#   $flavour - first argument, if it does not look like a filename
# NOTE(review): the m|...| patterns below arrive split across physical
# lines in this copy; kept byte-for-byte.
61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m
|\
.\w
+$| ?
pop : undef;
62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m
|\
.| ?
shift : undef;
# Locate this script's own directory so the arm-xlate.pl transliterator
# can be found either beside the script or under ../../perlasm.
64 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
65 ( $xlate="${dir}arm-xlate.pl" and -f
$xlate ) or
66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f
$xlate) or
67 die "can't locate arm-xlate.pl";
# Pipe all generated code through arm-xlate.pl, which transliterates the
# unified source into the requested 32-/64-bit flavour.
69 open OUT
,"| \"$^X\" $xlate $flavour \"$output\""
70 or die "can't call $xlate: $!";
# armasm ("win" flavours) spells the byte directive DCB; GNU as uses .byte.
75 $_byte = ($flavour =~ /win/ ?
"DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___
if ($flavour !~ /64/);
84 .arch armv7
-a
// don
't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
# Register assignments for ${prefix}_set_encrypt_key:
#   integer: $inp=x0 (user key), $bits=w1 (key size), $out=x2 (schedule),
#   $ptr=x3 (constants pointer), $rounds=w12 (round counter).
# NEON temporaries: q0-q6 in 64-bit mode; q0-q3/q8-q10 in 32-bit mode,
# avoiding q4-q7 (d8-d15), which the 32-bit ABI treats as callee-saved
# (hence the vstmdb {d8-d15} spills elsewhere in this file).
104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 AARCH64_VALID_CALL_TARGET
124 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
125 stp x29,x30,[sp,#-16]!
145 veor $zero,$zero,$zero
146 vld1.8 {$in0},[$inp],#16
147 mov $bits,#8 // reuse $bits
148 vld1.32 {$rcon,$mask},[$ptr],#32
156 vtbl.8 $key,{$in0},$mask
157 vext.8 $tmp,$zero,$in0,#12
158 vst1.32 {$in0},[$out],#16
163 vext.8 $tmp,$zero,$tmp,#12
165 vext.8 $tmp,$zero,$tmp,#12
168 vshl.u8 $rcon,$rcon,#1
172 vld1.32 {$rcon},[$ptr]
174 vtbl.8 $key,{$in0},$mask
175 vext.8 $tmp,$zero,$in0,#12
176 vst1.32 {$in0},[$out],#16
180 vext.8 $tmp,$zero,$tmp,#12
182 vext.8 $tmp,$zero,$tmp,#12
185 vshl.u8 $rcon,$rcon,#1
188 vtbl.8 $key,{$in0},$mask
189 vext.8 $tmp,$zero,$in0,#12
190 vst1.32 {$in0},[$out],#16
194 vext.8 $tmp,$zero,$tmp,#12
196 vext.8 $tmp,$zero,$tmp,#12
200 vst1.32 {$in0},[$out]
208 vld1.8 {$in1},[$inp],#8
209 vmov.i8 $key,#8 // borrow $key
210 vst1.32 {$in0},[$out],#16
211 vsub.i8 $mask,$mask,$key // adjust the mask
214 vtbl.8 $key,{$in1},$mask
215 vext.8 $tmp,$zero,$in0,#12
217 vst1.32 {$in1},[$out],#16
220 vst1.32 {$in1},[$out],#8
226 vext.8 $tmp,$zero,$tmp,#12
228 vext.8 $tmp,$zero,$tmp,#12
231 vdup.32 $tmp,${in0}[3]
234 vext.8 $in1,$zero,$in1,#12
235 vshl.u8 $rcon,$rcon,#1
239 vst1.32 {$in0},[$out],#16
251 vst1.32 {$in0},[$out],#16
254 vtbl.8 $key,{$in1},$mask
255 vext.8 $tmp,$zero,$in0,#12
256 vst1.32 {$in1},[$out],#16
261 vext.8 $tmp,$zero,$tmp,#12
263 vext.8 $tmp,$zero,$tmp,#12
266 vshl.u8 $rcon,$rcon,#1
268 vst1.32 {$in0},[$out],#16
271 vdup.32 $key,${in0}[3] // just splat
272 vext.8 $tmp,$zero,$in1,#12
276 vext.8 $tmp,$zero,$tmp,#12
278 vext.8 $tmp,$zero,$tmp,#12
289 mov x0,$ptr // return value
290 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
292 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
294 .globl ${prefix}_set_decrypt_key
295 .type ${prefix}_set_decrypt_key,%function
297 ${prefix}_set_decrypt_key:
299 $code.=<<___ if ($flavour =~ /64/);
300 AARCH64_SIGN_LINK_REGISTER
301 stp x29,x30,[sp,#-16]!
304 $code.=<<___ if ($flavour !~ /64/);
313 sub $out,$out,#240 // restore original $out
315 add $inp,$out,x12,lsl#4 // end of key schedule
317 vld1.32 {v0.16b},[$out]
318 vld1.32 {v1.16b},[$inp]
319 vst1.32 {v0.16b},[$inp],x4
320 vst1.32 {v1.16b},[$out],#16
323 vld1.32 {v0.16b},[$out]
324 vld1.32 {v1.16b},[$inp]
327 vst1.32 {v0.16b},[$inp],x4
328 vst1.32 {v1.16b},[$out],#16
332 vld1.32 {v0.16b},[$out]
334 vst1.32 {v0.16b},[$inp]
336 eor x0,x0,x0 // return value
339 $code.=<<___ if ($flavour !~ /64/);
342 $code.=<<___ if ($flavour =~ /64/);
344 AARCH64_VALIDATE_LINK_REGISTER
348 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Single-block en/decrypt generator: $dir selects the direction ("en"/"de");
# presumably supplied by an enclosing generator loop not visible here - TODO confirm.
# $e/$mc pick the instruction suffixes: aese/aesmc vs aesd/aesimc.
354 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# x0-x2: input block pointer, output block pointer, key schedule pointer.
355 my ($inp,$out,$key)=map("x$_",(0..2));
# q0-q2: two round-key registers plus the data block being processed.
357 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
360 .globl ${prefix}_${dir}crypt
361 .type ${prefix}_${dir}crypt,%function
363 ${prefix}_${dir}crypt:
365 $code.=<<___ if ($flavour =~ /64/);
366 AARCH64_VALID_CALL_TARGET
369 ldr $rounds,[$key,#240]
370 vld1.32 {$rndkey0},[$key],#16
371 vld1.8 {$inout},[$inp]
372 sub $rounds,$rounds,#2
373 vld1.32 {$rndkey1},[$key],#16
376 aes$e $inout,$rndkey0
378 vld1.32 {$rndkey0},[$key],#16
379 subs $rounds,$rounds,#2
380 aes$e $inout,$rndkey1
382 vld1.32 {$rndkey1},[$key],#16
385 aes$e $inout,$rndkey0
387 vld1.32 {$rndkey0},[$key]
388 aes$e $inout,$rndkey1
389 veor $inout,$inout,$rndkey0
391 vst1.8 {$inout},[$out]
393 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
400 # Performance in cycles per byte.
401 # Processed with AES-ECB different key size.
402 # It shows the value before and after optimization as below:
405 # AES-128-ECB AES-192-ECB AES-256-ECB
406 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
407 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
409 # Optimization is implemented by loop unrolling and interleaving.
410 # Commonly, we choose the unrolling factor as 5; if the input
411 # data size is smaller than 5 blocks, but not smaller than 3 blocks,
412 # we choose 3 as the unrolling factor.
413 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
414 # as one iteration, every loop the left size lsize -= 5*16.
415 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
416 # every loop lsize -=3*16.
417 # If lsize < 3*16 bytes, treat them as the tail, interleave the
418 # two blocks AES instructions.
419 # There is one special case, if the original input data size dsize
420 # = 16 bytes, we will treat it separately to improve the
421 # performance: one independent code block without LR, FP load and
422 # store, just looks like what the original ECB implementation does.
# Register assignments for ${prefix}_ecb_encrypt:
#   x0-x3: input, output, length, key; w4-w6: enc/dec flag, rounds, counter;
#   x7: pointer to the trailing round keys; x8: load step.
425 my ($inp,$out,$len,$key)=map("x$_",(0..3));
426 my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
427 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
429 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
431 ### q7 last round key
432 ### q10-q15 q7 Last 7 round keys
433 ### q8-q9 preloaded round keys except last 7 keys for big size
434 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Third data lane; $tmp2 deliberately reuses q9 here.
437 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
439 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
440 my ($dat4,$in4,$tmp4);
441 if ($flavour =~ /64/) {
442 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
446 .globl ${prefix}_ecb_encrypt
447 .type ${prefix}_ecb_encrypt,%function
449 ${prefix}_ecb_encrypt:
451 $code.=<<___ if ($flavour =~ /64/);
452 AARCH64_VALID_CALL_TARGET
454 // Original input data size bigger than 16, jump to big size processing.
456 vld1.8 {$dat0},[$inp]
457 cmp $enc,#0 // en- or decrypting?
458 ldr $rounds,[$key,#240]
459 vld1.32 {q5-q6},[$key],#32 // load key schedule...
464 vld1.32 {q8-q9},[$key],#32 // load key schedule...
467 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
472 vld1.32 {q8},[$key],#16 // load key schedule...
475 vld1.32 {q9},[$key],#16 // load key schedule...
476 subs $rounds,$rounds,#2 // bias
477 b.gt .Lecb_round_loop
479 vld1.32 {q10-q11},[$key],#32 // load key schedule...
484 vld1.32 {q12-q13},[$key],#32 // load key schedule...
489 vld1.32 {q14-q15},[$key],#32 // load key schedule...
494 vld1.32 {$rndlast},[$key]
498 veor $dat0,$dat0,$rndlast
499 vst1.8 {$dat0},[$out]
504 vld1.32 {q8-q9},[$key],#32 // load key schedule...
507 subs $rounds,$rounds,#10 // bias
509 .Lecb_dec_round_loop:
512 vld1.32 {q8},[$key],#16 // load key schedule...
515 vld1.32 {q9},[$key],#16 // load key schedule...
516 subs $rounds,$rounds,#2 // bias
517 b.gt .Lecb_dec_round_loop
519 vld1.32 {q10-q11},[$key],#32 // load key schedule...
524 vld1.32 {q12-q13},[$key],#32 // load key schedule...
529 vld1.32 {q14-q15},[$key],#32 // load key schedule...
534 vld1.32 {$rndlast},[$key]
538 veor $dat0,$dat0,$rndlast
539 vst1.8 {$dat0},[$out]
543 $code.=<<___ if ($flavour =~ /64/);
544 stp x29,x30,[sp,#-16]!
547 $code.=<<___ if ($flavour !~ /64/);
550 vstmdb sp!,{d8-d15} @ ABI specification says so
551 ldmia ip,{r4-r5} @ load remaining args
559 cmp $enc,#0 // en- or decrypting?
560 ldr $rounds,[$key,#240]
562 vld1.8 {$dat},[$inp],$step
564 vld1.32 {q8-q9},[$key] // load key schedule...
565 sub $rounds,$rounds,#6
566 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
567 sub $rounds,$rounds,#2
568 vld1.32 {q10-q11},[$key_],#32
569 vld1.32 {q12-q13},[$key_],#32
570 vld1.32 {q14-q15},[$key_],#32
571 vld1.32 {$rndlast},[$key_]
577 vld1.8 {$dat1},[$inp],#16
578 subs $len,$len,#32 // bias
580 vorr $in1,$dat1,$dat1
581 vorr $dat2,$dat1,$dat1
586 vld1.8 {$dat2},[$inp],#16
588 $code.=<<___ if ($flavour =~ /64/);
592 vld1.8 {$dat3},[$inp],#16
593 vld1.8 {$dat4},[$inp],#16
594 sub $len,$len,#32 // bias
608 vld1.32 {q8},[$key_],#16
620 vld1.32 {q9},[$key_],#16
633 cmp $len,#0x40 // because .Lecb_enc_tail4x
646 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
659 add $inp,$inp,x6 // $inp is adjusted in such way that
660 // at exit from the loop $dat1-$dat4
661 // are loaded with last "words"
662 add x6,$len,#0x60 // because .Lecb_enc_tail4x
709 vld1.8 {$in0},[$inp],#16
711 vld1.8 {$in1},[$inp],#16
713 vld1.8 {$in2},[$inp],#16
715 vld1.8 {$in3},[$inp],#16
717 vld1.8 {$in4},[$inp],#16
718 cbz x6,.Lecb_enc_tail4x
719 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
720 veor $tmp0,$rndlast,$dat0
722 veor $tmp1,$rndlast,$dat1
724 veor $tmp2,$rndlast,$dat2
726 veor $tmp3,$rndlast,$dat3
728 veor $tmp4,$rndlast,$dat4
729 vst1.8 {$tmp0},[$out],#16
731 vst1.8 {$tmp1},[$out],#16
733 vst1.8 {$tmp2},[$out],#16
734 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
735 vst1.8 {$tmp3},[$out],#16
736 vst1.8 {$tmp4},[$out],#16
753 veor $tmp1,$rndlast,$dat1
754 veor $tmp2,$rndlast,$dat2
755 veor $tmp3,$rndlast,$dat3
756 veor $tmp4,$rndlast,$dat4
757 vst1.8 {$tmp1},[$out],#16
758 vst1.8 {$tmp2},[$out],#16
759 vst1.8 {$tmp3},[$out],#16
760 vst1.8 {$tmp4},[$out],#16
773 vld1.32 {q8},[$key_],#16
781 vld1.32 {q9},[$key_],#16
791 mov.lo x6,$len // x6, $cnt, is zero at this point
798 add $inp,$inp,x6 // $inp is adjusted in such way that
799 // at exit from the loop $dat1-$dat2
800 // are loaded with last "words"
808 vld1.8 {$in0},[$inp],#16
815 vld1.8 {$in1},[$inp],#16
822 vld1.8 {$in2},[$inp],#16
826 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
828 veor $tmp0,$rndlast,$dat0
829 veor $tmp1,$rndlast,$dat1
830 veor $dat2,$dat2,$rndlast
831 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
832 vst1.8 {$tmp0},[$out],#16
834 vst1.8 {$tmp1},[$out],#16
836 vst1.8 {$dat2},[$out],#16
849 vld1.32 {q8},[$key_],#16
855 vld1.32 {q9},[$key_],#16
882 veor $tmp1,$rndlast,$dat1
883 veor $tmp2,$rndlast,$dat2
884 vst1.8 {$tmp1},[$out],#16
885 vst1.8 {$tmp2},[$out],#16
889 veor $tmp1,$rndlast,$dat2
890 vst1.8 {$tmp1},[$out],#16
897 vld1.8 {$dat1},[$inp],#16
898 subs $len,$len,#32 // bias
900 vorr $in1,$dat1,$dat1
901 vorr $dat2,$dat1,$dat1
906 vld1.8 {$dat2},[$inp],#16
908 $code.=<<___ if ($flavour =~ /64/);
912 vld1.8 {$dat3},[$inp],#16
913 vld1.8 {$dat4},[$inp],#16
914 sub $len,$len,#32 // bias
928 vld1.32 {q8},[$key_],#16
940 vld1.32 {q9},[$key_],#16
953 cmp $len,#0x40 // because .Lecb_tail4x
966 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
979 add $inp,$inp,x6 // $inp is adjusted in such way that
980 // at exit from the loop $dat1-$dat4
981 // are loaded with last "words"
982 add x6,$len,#0x60 // because .Lecb_tail4x
1029 vld1.8 {$in0},[$inp],#16
1031 vld1.8 {$in1},[$inp],#16
1033 vld1.8 {$in2},[$inp],#16
1035 vld1.8 {$in3},[$inp],#16
1037 vld1.8 {$in4},[$inp],#16
1039 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1040 veor $tmp0,$rndlast,$dat0
1041 vorr $dat0,$in0,$in0
1042 veor $tmp1,$rndlast,$dat1
1043 vorr $dat1,$in1,$in1
1044 veor $tmp2,$rndlast,$dat2
1045 vorr $dat2,$in2,$in2
1046 veor $tmp3,$rndlast,$dat3
1047 vorr $dat3,$in3,$in3
1048 veor $tmp4,$rndlast,$dat4
1049 vst1.8 {$tmp0},[$out],#16
1050 vorr $dat4,$in4,$in4
1051 vst1.8 {$tmp1},[$out],#16
1053 vst1.8 {$tmp2},[$out],#16
1054 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1055 vst1.8 {$tmp3},[$out],#16
1056 vst1.8 {$tmp4},[$out],#16
1057 b.hs .Loop5x_ecb_dec
1063 subs $len,$len,#0x30
1064 vorr $dat0,$in2,$in2
1065 vorr $dat1,$in3,$in3
1066 vorr $dat2,$in4,$in4
1073 veor $tmp1,$rndlast,$dat1
1074 veor $tmp2,$rndlast,$dat2
1075 veor $tmp3,$rndlast,$dat3
1076 veor $tmp4,$rndlast,$dat4
1077 vst1.8 {$tmp1},[$out],#16
1078 vst1.8 {$tmp2},[$out],#16
1079 vst1.8 {$tmp3},[$out],#16
1080 vst1.8 {$tmp4},[$out],#16
1093 vld1.32 {q8},[$key_],#16
1101 vld1.32 {q9},[$key_],#16
1102 b.gt .Loop3x_ecb_dec
1110 subs $len,$len,#0x30
1111 mov.lo x6,$len // x6, $cnt, is zero at this point
1118 add $inp,$inp,x6 // $inp is adjusted in such way that
1119 // at exit from the loop $dat1-$dat2
1120 // are loaded with last "words"
1128 vld1.8 {$in0},[$inp],#16
1135 vld1.8 {$in1},[$inp],#16
1142 vld1.8 {$in2},[$inp],#16
1146 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1148 veor $tmp0,$rndlast,$dat0
1149 veor $tmp1,$rndlast,$dat1
1150 veor $dat2,$dat2,$rndlast
1151 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1152 vst1.8 {$tmp0},[$out],#16
1153 vorr $dat0,$in0,$in0
1154 vst1.8 {$tmp1},[$out],#16
1155 vorr $dat1,$in1,$in1
1156 vst1.8 {$dat2},[$out],#16
1157 vorr $dat2,$in2,$in2
1158 b.hs .Loop3x_ecb_dec
1169 vld1.32 {q8},[$key_],#16
1175 vld1.32 {q9},[$key_],#16
1202 veor $tmp1,$rndlast,$dat1
1203 veor $tmp2,$rndlast,$dat2
1204 vst1.8 {$tmp1},[$out],#16
1205 vst1.8 {$tmp2},[$out],#16
1209 veor $tmp1,$rndlast,$dat2
1210 vst1.8 {$tmp1},[$out],#16
1215 $code.=<<___ if ($flavour !~ /64/);
1217 ldmia sp!,{r4-r8,pc}
1219 $code.=<<___ if ($flavour =~ /64/);
1222 $code.=<<___ if ($flavour =~ /64/);
1227 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register assignments for ${prefix}_cbc_encrypt:
#   x0-x4: input, output, length, key, IV pointer; $enc=w5 (direction flag).
#   NOTE: $rounds aliases $enc (both "w5") - the flag is consumed before
#   the register is reused as the round count.
1231 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1232 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1233 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1235 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Direct pointers to round keys 4-7; $key7 reuses the schedule base register.
1236 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1238 ### q8-q15 preloaded key schedule
1241 .globl ${prefix}_cbc_encrypt
1242 .type ${prefix}_cbc_encrypt,%function
1244 ${prefix}_cbc_encrypt:
1246 $code.=<<___ if ($flavour =~ /64/);
1247 AARCH64_VALID_CALL_TARGET
1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249 stp x29,x30,[sp,#-16]!
1252 $code.=<<___ if ($flavour !~ /64/);
1254 stmdb sp!,{r4-r8,lr}
1255 vstmdb sp!,{d8-d15} @ ABI specification says so
1256 ldmia ip,{r4-r5} @ load remaining args
1264 cmp $enc,#0 // en- or decrypting?
1265 ldr $rounds,[$key,#240]
1267 vld1.8 {$ivec},[$ivp]
1268 vld1.8 {$dat},[$inp],$step
1270 vld1.32 {q8-q9},[$key] // load key schedule...
1271 sub $rounds,$rounds,#6
1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1273 sub $rounds,$rounds,#2
1274 vld1.32 {q10-q11},[$key_],#32
1275 vld1.32 {q12-q13},[$key_],#32
1276 vld1.32 {q14-q15},[$key_],#32
1277 vld1.32 {$rndlast},[$key_]
1284 veor $dat,$dat,$ivec
1285 veor $rndzero_n_last,q8,$rndlast
1288 vld1.32 {$in0-$in1},[$key_]
1290 add $key4,$key,#16*4
1291 add $key5,$key,#16*5
1294 add $key6,$key,#16*6
1295 add $key7,$key,#16*7
1302 vst1.8 {$ivec},[$out],#16
1308 vld1.32 {q8},[$key4]
1312 vld1.32 {q9},[$key5]
1317 vld1.32 {q8},[$key6]
1320 vld1.32 {q9},[$key7]
1334 vld1.8 {q8},[$inp],$step
1337 veor q8,q8,$rndzero_n_last
1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1344 veor $ivec,$dat,$rndlast
1347 vst1.8 {$ivec},[$out],#16
1352 vld1.32 {$in0-$in1},[$key_]
1355 b .Lenter_cbc_enc128
1359 vst1.8 {$ivec},[$out],#16
1373 vld1.8 {q8},[$inp],$step
1380 veor q8,q8,$rndzero_n_last
1382 veor $ivec,$dat,$rndlast
1383 b.hs .Loop_cbc_enc128
1385 vst1.8 {$ivec},[$out],#16
# Extra data lanes for the multi-block CBC-decrypt path; $tmp2 reuses q9.
1389 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1391 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1392 my ($dat4,$in4,$tmp4);
1393 if ($flavour =~ /64/) {
1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1400 vld1.8 {$dat2},[$inp],#16
1401 subs $len,$len,#32 // bias
1404 vorr $dat1,$dat,$dat
1405 vorr $in2,$dat2,$dat2
1408 vorr $dat1,$dat2,$dat2
1409 vld1.8 {$dat2},[$inp],#16
1411 vorr $in1,$dat1,$dat1
1412 vorr $in2,$dat2,$dat2
1414 $code.=<<___ if ($flavour =~ /64/);
1416 b.lo .Loop3x_cbc_dec
1418 vld1.8 {$dat3},[$inp],#16
1419 vld1.8 {$dat4},[$inp],#16
1420 sub $len,$len,#32 // bias
1422 vorr $in3,$dat3,$dat3
1423 vorr $in4,$dat4,$dat4
1436 vld1.32 {q8},[$key_],#16
1448 vld1.32 {q9},[$key_],#16
1449 b.gt .Loop5x_cbc_dec
1461 cmp $len,#0x40 // because .Lcbc_tail4x
1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1487 add $inp,$inp,x6 // $inp is adjusted in such way that
1488 // at exit from the loop $dat1-$dat4
1489 // are loaded with last "words"
1490 add x6,$len,#0x60 // because .Lcbc_tail4x
1536 veor $tmp0,$ivec,$rndlast
1538 veor $tmp1,$in0,$rndlast
1539 vld1.8 {$in0},[$inp],#16
1541 veor $tmp2,$in1,$rndlast
1542 vld1.8 {$in1},[$inp],#16
1544 veor $tmp3,$in2,$rndlast
1545 vld1.8 {$in2},[$inp],#16
1547 veor $tmp4,$in3,$rndlast
1548 vld1.8 {$in3},[$inp],#16
1550 vorr $ivec,$in4,$in4
1551 vld1.8 {$in4},[$inp],#16
1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1554 veor $tmp0,$tmp0,$dat0
1555 vorr $dat0,$in0,$in0
1556 veor $tmp1,$tmp1,$dat1
1557 vorr $dat1,$in1,$in1
1558 veor $tmp2,$tmp2,$dat2
1559 vorr $dat2,$in2,$in2
1560 veor $tmp3,$tmp3,$dat3
1561 vorr $dat3,$in3,$in3
1562 veor $tmp4,$tmp4,$dat4
1563 vst1.8 {$tmp0},[$out],#16
1564 vorr $dat4,$in4,$in4
1565 vst1.8 {$tmp1},[$out],#16
1567 vst1.8 {$tmp2},[$out],#16
1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1569 vst1.8 {$tmp3},[$out],#16
1570 vst1.8 {$tmp4},[$out],#16
1571 b.hs .Loop5x_cbc_dec
1577 subs $len,$len,#0x30
1578 vorr $dat0,$in2,$in2
1580 vorr $dat1,$in3,$in3
1582 vorr $dat2,$in4,$in4
1590 veor $tmp1,$tmp0,$dat1
1591 veor $tmp2,$tmp2,$dat2
1592 veor $tmp3,$tmp3,$dat3
1593 veor $tmp4,$tmp4,$dat4
1594 vst1.8 {$tmp1},[$out],#16
1595 vst1.8 {$tmp2},[$out],#16
1596 vst1.8 {$tmp3},[$out],#16
1597 vst1.8 {$tmp4},[$out],#16
1610 vld1.32 {q8},[$key_],#16
1618 vld1.32 {q9},[$key_],#16
1619 b.gt .Loop3x_cbc_dec
1627 veor $tmp0,$ivec,$rndlast
1628 subs $len,$len,#0x30
1629 veor $tmp1,$in0,$rndlast
1630 mov.lo x6,$len // x6, $cnt, is zero at this point
1637 veor $tmp2,$in1,$rndlast
1638 add $inp,$inp,x6 // $inp is adjusted in such way that
1639 // at exit from the loop $dat1-$dat2
1640 // are loaded with last "words"
1641 vorr $ivec,$in2,$in2
1649 vld1.8 {$in0},[$inp],#16
1656 vld1.8 {$in1},[$inp],#16
1663 vld1.8 {$in2},[$inp],#16
1667 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1669 veor $tmp0,$tmp0,$dat0
1670 veor $tmp1,$tmp1,$dat1
1671 veor $dat2,$dat2,$tmp2
1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1673 vst1.8 {$tmp0},[$out],#16
1674 vorr $dat0,$in0,$in0
1675 vst1.8 {$tmp1},[$out],#16
1676 vorr $dat1,$in1,$in1
1677 vst1.8 {$dat2},[$out],#16
1678 vorr $dat2,$in2,$in2
1679 b.hs .Loop3x_cbc_dec
1690 vld1.32 {q8},[$key_],#16
1696 vld1.32 {q9},[$key_],#16
1716 veor $tmp1,$ivec,$rndlast
1721 veor $tmp2,$in1,$rndlast
1725 veor $tmp1,$tmp1,$dat1
1726 veor $tmp2,$tmp2,$dat2
1727 vorr $ivec,$in2,$in2
1728 vst1.8 {$tmp1},[$out],#16
1729 vst1.8 {$tmp2},[$out],#16
1733 veor $tmp1,$tmp1,$dat2
1734 vorr $ivec,$in2,$in2
1735 vst1.8 {$tmp1},[$out],#16
1738 vst1.8 {$ivec},[$ivp]
1742 $code.=<<___ if ($flavour !~ /64/);
1744 ldmia sp!,{r4-r8,pc}
1746 $code.=<<___ if ($flavour =~ /64/);
1751 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register assignments for ${prefix}_ctr32_encrypt_blocks:
#   x0-x4: input, output, length, key, IV pointer; w5/w6: rounds/counter.
1755 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1756 my ($rounds,$cnt,$key_)=("w5","w6","x7");
# 32-bit counter words; $step (x12) overlaps $tctr2 (w12) by design.
1757 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1758 my $step="x12"; # aliases with $tctr2
1760 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1761 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1763 # used only in 64-bit mode...
1764 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1766 my ($dat,$tmp)=($dat0,$tmp0);
1768 ### q8-q15 preloaded key schedule
1771 .globl ${prefix}_ctr32_encrypt_blocks
1772 .type ${prefix}_ctr32_encrypt_blocks,%function
1774 ${prefix}_ctr32_encrypt_blocks:
1776 $code.=<<___ if ($flavour =~ /64/);
1777 AARCH64_VALID_CALL_TARGET
1778 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1779 stp x29,x30,[sp,#-16]!
1782 $code.=<<___ if ($flavour !~ /64/);
1784 stmdb sp!,{r4-r10,lr}
1785 vstmdb sp!,{d8-d15} @ ABI specification says so
1786 ldr r4, [ip] @ load remaining arg
1789 ldr $rounds,[$key,#240]
1791 ldr $ctr, [$ivp, #12]
1793 vld1.8 {$dat0},[$ivp]
1795 vld1.32 {$dat0},[$ivp]
1797 vld1.32 {q8-q9},[$key] // load key schedule...
1798 sub $rounds,$rounds,#4
1801 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1802 sub $rounds,$rounds,#2
1803 vld1.32 {q12-q13},[$key_],#32
1804 vld1.32 {q14-q15},[$key_],#32
1805 vld1.32 {$rndlast},[$key_]
1812 add $tctr1, $ctr, #1
1813 vorr $ivec,$dat0,$dat0
1815 vmov.32 ${ivec}[3],$tctr1
1817 vorr $dat1,$ivec,$ivec
1820 vmov.32 ${ivec}[3],$tctr2
1821 sub $len,$len,#3 // bias
1822 vorr $dat2,$ivec,$ivec
1824 $code.=<<___ if ($flavour =~ /64/);
1830 vorr $dat3,$dat0,$dat0
1832 vorr $dat4,$dat0,$dat0
1834 vmov.32 ${dat3}[3],w13
1835 sub $len,$len,#2 // bias
1836 vmov.32 ${dat4}[3],w14
1852 vld1.32 {q8},[$key_],#16
1864 vld1.32 {q9},[$key_],#16
1878 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1890 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1926 vld1.8 {$in0},[$inp],#16
1929 vld1.8 {$in1},[$inp],#16
1932 vld1.8 {$in2},[$inp],#16
1935 vld1.8 {$in3},[$inp],#16
1938 vld1.8 {$in4},[$inp],#16
1941 veor $in0,$in0,$rndlast
1943 veor $in1,$in1,$rndlast
1945 veor $in2,$in2,$rndlast
1947 veor $in3,$in3,$rndlast
1949 veor $in4,$in4,$rndlast
1951 veor $in0,$in0,$dat0
1952 vorr $dat0,$ivec,$ivec
1953 veor $in1,$in1,$dat1
1954 vorr $dat1,$ivec,$ivec
1955 veor $in2,$in2,$dat2
1956 vorr $dat2,$ivec,$ivec
1957 veor $in3,$in3,$dat3
1958 vorr $dat3,$ivec,$ivec
1959 veor $in4,$in4,$dat4
1960 vorr $dat4,$ivec,$ivec
1962 vst1.8 {$in0},[$out],#16
1963 vmov.32 ${dat0}[3],$tctr0
1964 vst1.8 {$in1},[$out],#16
1965 vmov.32 ${dat1}[3],$tctr1
1966 vst1.8 {$in2},[$out],#16
1967 vmov.32 ${dat2}[3],$tctr2
1968 vst1.8 {$in3},[$out],#16
1969 vmov.32 ${dat3}[3],w13
1970 vst1.8 {$in4},[$out],#16
1971 vmov.32 ${dat4}[3],w14
1974 cbz $len,.Lctr32_done
1988 sub $len,$len,#3 // bias
2002 vld1.32 {q8},[$key_],#16
2010 vld1.32 {q9},[$key_],#16
2017 vld1.8 {$in0},[$inp],#16
2021 vld1.8 {$in1},[$inp],#16
2027 vld1.8 {$in2},[$inp],#16
2035 veor $in0,$in0,$rndlast
2039 veor $in1,$in1,$rndlast
2045 veor $in2,$in2,$rndlast
2046 vmov.32 ${ivec}[3], $tctr0
2049 vorr $dat0,$ivec,$ivec
2053 vmov.32 ${ivec}[3], $tctr1
2057 vorr $dat1,$ivec,$ivec
2058 vmov.32 ${ivec}[3], $tctr2
2061 vorr $dat2,$ivec,$ivec
2067 veor $in0,$in0,$tmp0
2068 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2069 vst1.8 {$in0},[$out],#16
2070 veor $in1,$in1,$tmp1
2072 vst1.8 {$in1},[$out],#16
2073 veor $in2,$in2,$tmp2
2074 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2075 vst1.8 {$in2},[$out],#16
2089 vld1.32 {q8},[$key_],#16
2095 vld1.32 {q9},[$key_],#16
2106 vld1.8 {$in0},[$inp],$step
2111 vld1.8 {$in1},[$inp]
2116 veor $in0,$in0,$rndlast
2121 veor $in1,$in1,$rndlast
2126 veor $in0,$in0,$dat0
2127 veor $in1,$in1,$dat1
2128 vst1.8 {$in0},[$out],#16
2130 vst1.8 {$in1},[$out]
2134 $code.=<<___ if ($flavour !~ /64/);
2136 ldmia sp!,{r4-r10,pc}
2138 $code.=<<___ if ($flavour =~ /64/);
2143 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2146 # Performance in cycles per byte.
2147 # Processed with AES-XTS different key size.
2148 # It shows the value before and after optimization as below:
2151 # AES-128-XTS AES-256-XTS
2152 # Cortex-A57 3.36/1.09 4.02/1.37
2153 # Cortex-A72 3.03/1.02 3.28/1.33
2155 # Optimization is implemented by loop unrolling and interleaving.
2156 # Commonly, we choose the unrolling factor as 5; if the input
2157 # data size is smaller than 5 blocks, but not smaller than 3 blocks,
2158 # we choose 3 as the unrolling factor.
2159 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2160 # as one iteration, every loop the left size lsize -= 5*16.
2161 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2162 # will be processed specially, which be integrated into the 5*16 bytes
2163 # loop to improve the efficiency.
2164 # There is one special case, if the original input data size dsize
2165 # = 16 bytes, we will treat it separately to improve the
2166 # performance: one independent code block without LR, FP load and
2168 # Encryption will process the (length -tailcnt) bytes as mentioned
2169 # previously, then encrypt the composite block as last second
2171 # Decryption will process the (length -tailcnt -1) bytes as mentioned
2172 # previously, then decrypt the last second cipher block to get the
2173 # last plain block(tail), decrypt the composite block as last second
# Register assignments for ${prefix}_xts_encrypt (64-bit only):
#   x0-x5: input, output, length, data key (key1), tweak key (key2), IV ptr.
#   $tailcnt=x21 holds length%16 for ciphertext stealing; $constnum/w19 and
#   $midnum/w22 serve the GF(2^128) tweak-doubling arithmetic.
2177 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2178 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2179 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2180 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2181 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2182 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2183 my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2184 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2185 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2187 my ($tmpin)=("v26.16b");
2188 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2191 # q10-q15, q7 Last 7 round keys
2192 # q8-q9 preloaded round keys except last 7 keys for big size
2193 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Extra data lanes for the unrolled XTS loops; $tmp2 reuses q9.
2196 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2198 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2199 my ($dat4,$in4,$tmp4);
2200 if ($flavour =~ /64/) {
2201 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2204 $code.=<<___ if ($flavour =~ /64/);
2205 .globl ${prefix}_xts_encrypt
2206 .type ${prefix}_xts_encrypt,%function
2208 ${prefix}_xts_encrypt:
2210 $code.=<<___ if ($flavour =~ /64/);
2211 AARCH64_VALID_CALL_TARGET
2213 // Original input data size bigger than 16, jump to big size processing.
2214 b.ne .Lxts_enc_big_size
2215 // Encrypt the iv with key2, as the first XEX iv.
2216 ldr $rounds,[$key2,#240]
2217 vld1.8 {$dat},[$key2],#16
2218 vld1.8 {$iv0},[$ivp]
2219 sub $rounds,$rounds,#2
2220 vld1.8 {$dat1},[$key2],#16
2225 vld1.32 {$dat},[$key2],#16
2226 subs $rounds,$rounds,#2
2229 vld1.32 {$dat1},[$key2],#16
2230 b.gt .Loop_enc_iv_enc
2234 vld1.32 {$dat},[$key2]
2238 vld1.8 {$dat0},[$inp]
2239 veor $dat0,$iv0,$dat0
2241 ldr $rounds,[$key1,#240]
2242 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2246 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2249 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2251 .Lxts_enc_round_loop:
2254 vld1.32 {q8},[$key1],#16 // load key schedule...
2257 vld1.32 {q9},[$key1],#16 // load key schedule...
2258 subs $rounds,$rounds,#2 // bias
2259 b.gt .Lxts_enc_round_loop
2261 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2266 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2271 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2276 vld1.32 {$rndlast},[$key1]
2280 veor $dat0,$dat0,$rndlast
2281 veor $dat0,$dat0,$iv0
2282 vst1.8 {$dat0},[$out]
2283 b .Lxts_enc_final_abort
2288 $code.=<<___ if ($flavour =~ /64/);
2289 stp $constnumx,$tmpinp,[sp,#-64]!
2290 stp $tailcnt,$midnumx,[sp,#48]
2291 stp $ivd10,$ivd20,[sp,#32]
2292 stp $ivd30,$ivd40,[sp,#16]
2294 // tailcnt store the tail value of length%16.
2295 and $tailcnt,$len,#0xf
2300 csel $step,xzr,$step,eq
2302 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2303 ldr $rounds,[$key2,#240]
2304 vld1.32 {$dat},[$key2],#16
2305 vld1.8 {$iv0},[$ivp]
2306 sub $rounds,$rounds,#2
2307 vld1.32 {$dat1},[$key2],#16
2312 vld1.32 {$dat},[$key2],#16
2313 subs $rounds,$rounds,#2
2316 vld1.32 {$dat1},[$key2],#16
2321 vld1.32 {$dat},[$key2]
2325 // The iv for second block
2326 // $ivl- iv(low), $ivh - iv(high)
2327 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2331 extr $midnumx,$ivh,$ivh,#32
2332 extr $ivh,$ivh,$ivl,#63
2333 and $tmpmw,$constnum,$midnum,asr#31
2334 eor $ivl,$tmpmx,$ivl,lsl#1
2338 ldr $rounds0,[$key1,#240] // next starting point
2339 vld1.8 {$dat},[$inp],$step
2341 vld1.32 {q8-q9},[$key1] // load key schedule...
2342 sub $rounds0,$rounds0,#6
2343 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2344 sub $rounds0,$rounds0,#2
2345 vld1.32 {q10-q11},[$key_],#32
2346 vld1.32 {q12-q13},[$key_],#32
2347 vld1.32 {q14-q15},[$key_],#32
2348 vld1.32 {$rndlast},[$key_]
2351 mov $rounds,$rounds0
2355 vld1.8 {$dat2},[$inp],#16
2356 subs $len,$len,#32 // bias
2357 add $rounds,$rounds0,#2
2359 vorr $dat1,$dat,$dat
2361 vorr $in2,$dat2,$dat2
2362 vorr $in4,$dat2,$dat2
2363 b.lo .Lxts_inner_enc_tail
2364 veor $dat,$dat,$iv0 // before encryption, xor with iv
2365 veor $dat2,$dat2,$iv1
2367 // The iv for third block
2368 extr $midnumx,$ivh,$ivh,#32
2369 extr $ivh,$ivh,$ivl,#63
2370 and $tmpmw,$constnum,$midnum,asr#31
2371 eor $ivl,$tmpmx,$ivl,lsl#1
2376 vorr $dat1,$dat2,$dat2
2377 vld1.8 {$dat2},[$inp],#16
2379 vorr $in1,$dat1,$dat1
2380 veor $in2,$dat2,$iv2 // the third block
2381 veor $dat2,$dat2,$iv2
2383 b.lo .Lxts_outer_enc_tail
2385 // The iv for fourth block
2386 extr $midnumx,$ivh,$ivh,#32
2387 extr $ivh,$ivh,$ivl,#63
2388 and $tmpmw,$constnum,$midnum,asr#31
2389 eor $ivl,$tmpmx,$ivl,lsl#1
2393 vld1.8 {$dat3},[$inp],#16
2394 // The iv for fifth block
2395 extr $midnumx,$ivh,$ivh,#32
2396 extr $ivh,$ivh,$ivl,#63
2397 and $tmpmw,$constnum,$midnum,asr#31
2398 eor $ivl,$tmpmx,$ivl,lsl#1
2402 vld1.8 {$dat4},[$inp],#16
2403 veor $dat3,$dat3,$iv3 // the fourth block
2404 veor $dat4,$dat4,$iv4
2405 sub $len,$len,#32 // bias
2406 mov $rounds,$rounds0
2421 vld1.32 {q8},[$key_],#16
2422 subs $rounds,$rounds,#2
2433 vld1.32 {q9},[$key_],#16
2434 b.gt .Loop5x_xts_enc
2446 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2458 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2471 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2472 // at exit from the loop v1.16b-v26.16b
2473 // are loaded with last "words"
2474 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2520 veor $tmp0,$rndlast,$iv0
2522 // The iv for first block of one iteration
2523 extr $midnumx,$ivh,$ivh,#32
2524 extr $ivh,$ivh,$ivl,#63
2525 and $tmpmw,$constnum,$midnum,asr#31
2526 eor $ivl,$tmpmx,$ivl,lsl#1
2529 veor $tmp1,$rndlast,$iv1
2530 vld1.8 {$in0},[$inp],#16
2532 // The iv for second block
2533 extr $midnumx,$ivh,$ivh,#32
2534 extr $ivh,$ivh,$ivl,#63
2535 and $tmpmw,$constnum,$midnum,asr#31
2536 eor $ivl,$tmpmx,$ivl,lsl#1
2539 veor $tmp2,$rndlast,$iv2
2540 vld1.8 {$in1},[$inp],#16
2542 // The iv for third block
2543 extr $midnumx,$ivh,$ivh,#32
2544 extr $ivh,$ivh,$ivl,#63
2545 and $tmpmw,$constnum,$midnum,asr#31
2546 eor $ivl,$tmpmx,$ivl,lsl#1
2549 veor $tmp3,$rndlast,$iv3
2550 vld1.8 {$in2},[$inp],#16
2552 // The iv for fourth block
2553 extr $midnumx,$ivh,$ivh,#32
2554 extr $ivh,$ivh,$ivl,#63
2555 and $tmpmw,$constnum,$midnum,asr#31
2556 eor $ivl,$tmpmx,$ivl,lsl#1
2559 veor $tmp4,$rndlast,$iv4
2560 vld1.8 {$in3},[$inp],#16
2563 // The iv for fifth block
2564 extr $midnumx,$ivh,$ivh,#32
2565 extr $ivh,$ivh,$ivl,#63
2566 and $tmpmw,$constnum,$midnum,asr #31
2567 eor $ivl,$tmpmx,$ivl,lsl #1
2571 vld1.8 {$in4},[$inp],#16
2572 cbz $xoffset,.Lxts_enc_tail4x
2573 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2574 veor $tmp0,$tmp0,$dat0
2575 veor $dat0,$in0,$iv0
2576 veor $tmp1,$tmp1,$dat1
2577 veor $dat1,$in1,$iv1
2578 veor $tmp2,$tmp2,$dat2
2579 veor $dat2,$in2,$iv2
2580 veor $tmp3,$tmp3,$dat3
2581 veor $dat3,$in3,$iv3
2582 veor $tmp4,$tmp4,$dat4
2583 vst1.8 {$tmp0},[$out],#16
2584 veor $dat4,$in4,$iv4
2585 vst1.8 {$tmp1},[$out],#16
2586 mov $rounds,$rounds0
2587 vst1.8 {$tmp2},[$out],#16
2588 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2589 vst1.8 {$tmp3},[$out],#16
2590 vst1.8 {$tmp4},[$out],#16
2591 b.hs .Loop5x_xts_enc
2594 // If left 4 blocks, borrow the five block's processing
.
2596 b
.ne .Loop5x_enc_after
2603 veor
$dat0,$iv0,$in0
2604 veor
$dat1,$iv1,$in1
2605 veor
$dat2,$in2,$iv2
2606 veor
$dat3,$in3,$iv3
2607 veor
$dat4,$in4,$iv4
2608 b
.eq .Loop5x_xts_enc
2612 cbz
$len,.Lxts_enc_done
2614 add
$rounds,$rounds0,#2
2615 subs
$len,$len,#0x30
2616 b
.lo
.Lxts_inner_enc_tail
2618 veor
$dat0,$iv0,$in2
2619 veor
$dat1,$iv1,$in3
2620 veor
$dat2,$in4,$iv2
2621 b
.Lxts_outer_enc_tail
2626 veor
$tmp1,$dat1,$tmp1
2627 vst1
.8
{$tmp1},[$out],#16
2628 veor
$tmp2,$dat2,$tmp2
2629 vst1
.8
{$tmp2},[$out],#16
2630 veor
$tmp3,$dat3,$tmp3
2631 veor
$tmp4,$dat4,$tmp4
2632 vst1
.8
{$tmp3-$tmp4},[$out],#32
2636 .Lxts_outer_enc_tail
:
2643 vld1
.32
{q8
},[$key_],#16
2644 subs
$rounds,$rounds,#2
2651 vld1
.32
{q9
},[$key_],#16
2652 b
.gt .Lxts_outer_enc_tail
2660 veor
$tmp0,$iv0,$rndlast
2661 subs
$len,$len,#0x30
2662 // The iv
for first block
2665 //mov
$constnum,#0x87
2666 extr
$midnumx,$ivh,$ivh,#32
2667 extr
$ivh,$ivh,$ivl,#63
2668 and $tmpmw,$constnum,$midnum,asr
#31
2669 eor
$ivl,$tmpmx,$ivl,lsl
#1
2672 veor
$tmp1,$iv1,$rndlast
2673 csel
$xoffset,$len,$xoffset,lo
// x6
, w6
, is zero at this point
2680 veor
$tmp2,$iv2,$rndlast
2682 add
$xoffset,$xoffset,#0x20
2683 add
$inp,$inp,$xoffset
2707 vld1
.8
{$in2},[$inp],#16
2708 add
$rounds,$rounds0,#2
2709 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
2710 veor
$tmp0,$tmp0,$dat0
2711 veor
$tmp1,$tmp1,$dat1
2712 veor
$dat2,$dat2,$tmp2
2713 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
2714 vst1
.8
{$tmp0},[$out],#16
2715 vst1
.8
{$tmp1},[$out],#16
2716 vst1
.8
{$dat2},[$out],#16
2724 .Lxts_inner_enc_tail
:
2726 veor
$dat1,$in3,$iv0
2727 veor
$dat2,$in4,$iv1
2728 b
.eq .Lxts_enc_tail_loop
2729 veor
$dat2,$in4,$iv0
2730 .Lxts_enc_tail_loop
:
2735 vld1
.32
{q8
},[$key_],#16
2736 subs
$rounds,$rounds,#2
2741 vld1
.32
{q9
},[$key_],#16
2742 b
.gt .Lxts_enc_tail_loop
2761 veor
$tmp1,$iv0,$rndlast
2766 veor
$tmp2,$iv1,$rndlast
2770 veor
$tmp1,$tmp1,$dat1
2771 vst1
.8
{$tmp1},[$out],#16
2772 veor
$tmp2,$tmp2,$dat2
2774 vst1
.8
{$tmp2},[$out],#16
2778 extr
$midnumx,$ivh,$ivh,#32
2779 extr
$ivh,$ivh,$ivl,#63
2780 and $tmpmw,$constnum,$midnum,asr
#31
2781 eor
$ivl,$tmpmx,$ivl,lsl
#1
2787 veor
$tmp1,$tmp1,$dat2
2789 vst1
.8
{$tmp1},[$out],#16
2793 extr
$midnumx,$ivh,$ivh,#32
2794 extr
$ivh,$ivh,$ivl,#63
2795 and $tmpmw,$constnum,$midnum,asr
#31
2796 eor
$ivl,$tmpmx,$ivl,lsl
#1
2802 // Process the tail block with cipher stealing
.
2809 .composite_enc_loop
:
2810 subs
$tailcnt,$tailcnt,#1
2811 ldrb
$l2outp,[$out,$tailcnt]
2812 ldrb
$loutp,[$tmpinp,$tailcnt]
2813 strb
$l2outp,[$tmpoutp,$tailcnt]
2814 strb
$loutp,[$out,$tailcnt]
2815 b
.gt .composite_enc_loop
2816 .Lxts_enc_load_done
:
2817 vld1
.8
{$tmpin},[$out]
2818 veor
$tmpin,$tmpin,$iv0
2820 // Encrypt the composite block to get the
last second encrypted text block
2821 ldr
$rounds,[$key1,#240] // load key schedule...
2822 vld1
.8
{$dat},[$key1],#16
2823 sub $rounds,$rounds,#2
2824 vld1
.8
{$dat1},[$key1],#16 // load key schedule...
2828 vld1
.32
{$dat0},[$key1],#16
2829 subs
$rounds,$rounds,#2
2832 vld1
.32
{$dat1},[$key1],#16
2833 b
.gt .Loop_final_enc
2837 vld1
.32
{$dat0},[$key1]
2839 veor
$tmpin,$tmpin,$dat0
2840 veor
$tmpin,$tmpin,$iv0
2841 vst1
.8
{$tmpin},[$out]
2844 ldp
$tailcnt,$midnumx,[sp
,#48]
2845 ldp
$ivd10,$ivd20,[sp
,#32]
2846 ldp
$ivd30,$ivd40,[sp
,#16]
2847 ldp
$constnumx,$tmpinp,[sp
],#64
2848 .Lxts_enc_final_abort
:
2850 .size
${prefix
}_xts_encrypt
,.-${prefix
}_xts_encrypt
# Register/variable mapping for the XTS decrypt path.
# NOTE(review): the stray leading integers (leaked original line numbers,
# e.g. "2855 ") made these statements invalid Perl; they are removed here.
# Argument registers per the AAPCS64 calling convention (x0..x5).
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Scalar working registers: round counters, key pointer, input step,
# and the low/high 64-bit halves of the current tweak (iv).
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON data registers: plaintext/ciphertext blocks, staging copies,
# temporaries and the last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five tweak values kept live across the 5x-interleaved loop.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# 64-bit views (d-register / .d[1] lane) of the tweak registers, used for
# the GF(2^128) doubling done with extr/and/eor on the scalar side.
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# q10-q15, q7 Last 7 round keys
# q8-q9 preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
2877 if ($flavour =~ /64/) {
2878 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2881 $code.=<<___
if ($flavour =~ /64/);
2882 .globl
${prefix
}_xts_decrypt
2883 .type
${prefix
}_xts_decrypt
,%function
2885 ${prefix
}_xts_decrypt
:
2886 AARCH64_VALID_CALL_TARGET
2888 $code.=<<___
if ($flavour =~ /64/);
2890 // Original input data size bigger than
16, jump to big size processing
.
2891 b
.ne .Lxts_dec_big_size
2892 // Encrypt the iv with key2
, as the first XEX iv
.
2893 ldr
$rounds,[$key2,#240]
2894 vld1
.8
{$dat},[$key2],#16
2895 vld1
.8
{$iv0},[$ivp]
2896 sub $rounds,$rounds,#2
2897 vld1
.8
{$dat1},[$key2],#16
2899 .Loop_dec_small_iv_enc
:
2902 vld1
.32
{$dat},[$key2],#16
2903 subs
$rounds,$rounds,#2
2906 vld1
.32
{$dat1},[$key2],#16
2907 b
.gt .Loop_dec_small_iv_enc
2911 vld1
.32
{$dat},[$key2]
2915 vld1
.8
{$dat0},[$inp]
2916 veor
$dat0,$iv0,$dat0
2918 ldr
$rounds,[$key1,#240]
2919 vld1
.32
{q20
-q21
},[$key1],#32 // load key schedule...
2923 vld1
.32
{q8
-q9
},[$key1],#32 // load key schedule...
2926 subs
$rounds,$rounds,#10 // bias
2928 .Lxts_dec_round_loop
:
2931 vld1
.32
{q8
},[$key1],#16 // load key schedule...
2934 vld1
.32
{q9
},[$key1],#16 // load key schedule...
2935 subs
$rounds,$rounds,#2 // bias
2936 b
.gt .Lxts_dec_round_loop
2938 vld1
.32
{q10
-q11
},[$key1],#32 // load key schedule...
2943 vld1
.32
{q12
-q13
},[$key1],#32 // load key schedule...
2948 vld1
.32
{q14
-q15
},[$key1],#32 // load key schedule...
2953 vld1
.32
{$rndlast},[$key1]
2957 veor
$dat0,$dat0,$rndlast
2958 veor
$dat0,$iv0,$dat0
2959 vst1
.8
{$dat0},[$out]
2960 b
.Lxts_dec_final_abort
2963 $code.=<<___
if ($flavour =~ /64/);
2964 stp
$constnumx,$tmpinp,[sp
,#-64]!
2965 stp
$tailcnt,$midnumx,[sp
,#48]
2966 stp
$ivd10,$ivd20,[sp
,#32]
2967 stp
$ivd30,$ivd40,[sp
,#16]
2969 and $tailcnt,$len,#0xf
2973 b
.lo
.Lxts_dec_abort
2975 // Encrypt the iv with key2
, as the first XEX iv
2976 ldr
$rounds,[$key2,#240]
2977 vld1
.8
{$dat},[$key2],#16
2978 vld1
.8
{$iv0},[$ivp]
2979 sub $rounds,$rounds,#2
2980 vld1
.8
{$dat1},[$key2],#16
2985 vld1
.32
{$dat},[$key2],#16
2986 subs
$rounds,$rounds,#2
2989 vld1
.32
{$dat1},[$key2],#16
2990 b
.gt .Loop_dec_iv_enc
2994 vld1
.32
{$dat},[$key2]
2998 // The iv
for second block
2999 // $ivl- iv
(low
), $ivh - iv
(high
)
3000 // the five ivs stored into
, $iv0,$iv1,$iv2,$iv3,$iv4
3004 extr
$midnumx,$ivh,$ivh,#32
3005 extr
$ivh,$ivh,$ivl,#63
3006 and $tmpmw,$constnum,$midnum,asr
#31
3007 eor
$ivl,$tmpmx,$ivl,lsl
#1
3011 ldr
$rounds0,[$key1,#240] // load rounds number
3013 // The iv
for third block
3014 extr
$midnumx,$ivh,$ivh,#32
3015 extr
$ivh,$ivh,$ivl,#63
3016 and $tmpmw,$constnum,$midnum,asr
#31
3017 eor
$ivl,$tmpmx,$ivl,lsl
#1
3021 vld1
.32
{q8
-q9
},[$key1] // load key schedule
...
3022 sub $rounds0,$rounds0,#6
3023 add
$key_,$key1,$ivp,lsl
#4 // pointer to last 7 round keys
3024 sub $rounds0,$rounds0,#2
3025 vld1
.32
{q10
-q11
},[$key_],#32 // load key schedule...
3026 vld1
.32
{q12
-q13
},[$key_],#32
3027 vld1
.32
{q14
-q15
},[$key_],#32
3028 vld1
.32
{$rndlast},[$key_]
3030 // The iv
for fourth block
3031 extr
$midnumx,$ivh,$ivh,#32
3032 extr
$ivh,$ivh,$ivl,#63
3033 and $tmpmw,$constnum,$midnum,asr
#31
3034 eor
$ivl,$tmpmx,$ivl,lsl
#1
3039 mov
$rounds,$rounds0
3046 b
.eq .Lxts_dec_begin
3048 csel
$step,xzr
,$step,eq
3049 vld1
.8
{$dat},[$inp],#16
3053 vld1
.8
{$dat},[$inp],$step
3054 subs
$len,$len,#32 // bias
3055 add
$rounds,$rounds0,#2
3057 vorr
$dat1,$dat,$dat
3059 vld1
.8
{$dat2},[$inp],#16
3060 vorr
$in2,$dat2,$dat2
3061 vorr
$in4,$dat2,$dat2
3062 b
.lo
.Lxts_inner_dec_tail
3063 veor
$dat,$dat,$iv0 // before decryt
, xor with iv
3064 veor
$dat2,$dat2,$iv1
3066 vorr
$dat1,$dat2,$dat2
3067 vld1
.8
{$dat2},[$inp],#16
3069 vorr
$in1,$dat1,$dat1
3070 veor
$in2,$dat2,$iv2 // third block xox with third iv
3071 veor
$dat2,$dat2,$iv2
3073 b
.lo
.Lxts_outer_dec_tail
3075 vld1
.8
{$dat3},[$inp],#16
3077 // The iv
for fifth block
3078 extr
$midnumx,$ivh,$ivh,#32
3079 extr
$ivh,$ivh,$ivl,#63
3080 and $tmpmw,$constnum,$midnum,asr
#31
3081 eor
$ivl,$tmpmx,$ivl,lsl
#1
3085 vld1
.8
{$dat4},[$inp],#16
3086 veor
$dat3,$dat3,$iv3 // the fourth block
3087 veor
$dat4,$dat4,$iv4
3088 sub $len,$len,#32 // bias
3089 mov
$rounds,$rounds0
3104 vld1
.32
{q8
},[$key_],#16 // load key schedule...
3105 subs
$rounds,$rounds,#2
3116 vld1
.32
{q9
},[$key_],#16 // load key schedule...
3117 b
.gt .Loop5x_xts_dec
3129 subs
$len,$len,#0x50 // because .Lxts_dec_tail4x
3141 csel
$xoffset,xzr
,$len,gt // borrow x6
, w6
, "gt" is
not typo
3154 add
$inp,$inp,$xoffset // x0 is adjusted
in such way that
3155 // at
exit from the
loop v1
.16b
-v26
.16b
3156 // are loaded with
last "words"
3157 add
$xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3203 veor
$tmp0,$rndlast,$iv0
3205 // The iv
for first block of
next iteration
.
3206 extr
$midnumx,$ivh,$ivh,#32
3207 extr
$ivh,$ivh,$ivl,#63
3208 and $tmpmw,$constnum,$midnum,asr
#31
3209 eor
$ivl,$tmpmx,$ivl,lsl
#1
3212 veor
$tmp1,$rndlast,$iv1
3213 vld1
.8
{$in0},[$inp],#16
3215 // The iv
for second block
3216 extr
$midnumx,$ivh,$ivh,#32
3217 extr
$ivh,$ivh,$ivl,#63
3218 and $tmpmw,$constnum,$midnum,asr
#31
3219 eor
$ivl,$tmpmx,$ivl,lsl
#1
3222 veor
$tmp2,$rndlast,$iv2
3223 vld1
.8
{$in1},[$inp],#16
3225 // The iv
for third block
3226 extr
$midnumx,$ivh,$ivh,#32
3227 extr
$ivh,$ivh,$ivl,#63
3228 and $tmpmw,$constnum,$midnum,asr
#31
3229 eor
$ivl,$tmpmx,$ivl,lsl
#1
3232 veor
$tmp3,$rndlast,$iv3
3233 vld1
.8
{$in2},[$inp],#16
3235 // The iv
for fourth block
3236 extr
$midnumx,$ivh,$ivh,#32
3237 extr
$ivh,$ivh,$ivl,#63
3238 and $tmpmw,$constnum,$midnum,asr
#31
3239 eor
$ivl,$tmpmx,$ivl,lsl
#1
3242 veor
$tmp4,$rndlast,$iv4
3243 vld1
.8
{$in3},[$inp],#16
3246 // The iv
for fifth block
3247 extr
$midnumx,$ivh,$ivh,#32
3248 extr
$ivh,$ivh,$ivl,#63
3249 and $tmpmw,$constnum,$midnum,asr
#31
3250 eor
$ivl,$tmpmx,$ivl,lsl
#1
3254 vld1
.8
{$in4},[$inp],#16
3255 cbz
$xoffset,.Lxts_dec_tail4x
3256 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
3257 veor
$tmp0,$tmp0,$dat0
3258 veor
$dat0,$in0,$iv0
3259 veor
$tmp1,$tmp1,$dat1
3260 veor
$dat1,$in1,$iv1
3261 veor
$tmp2,$tmp2,$dat2
3262 veor
$dat2,$in2,$iv2
3263 veor
$tmp3,$tmp3,$dat3
3264 veor
$dat3,$in3,$iv3
3265 veor
$tmp4,$tmp4,$dat4
3266 vst1
.8
{$tmp0},[$out],#16
3267 veor
$dat4,$in4,$iv4
3268 vst1
.8
{$tmp1},[$out],#16
3269 mov
$rounds,$rounds0
3270 vst1
.8
{$tmp2},[$out],#16
3271 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
3272 vst1
.8
{$tmp3},[$out],#16
3273 vst1
.8
{$tmp4},[$out],#16
3274 b
.hs
.Loop5x_xts_dec
3277 b
.ne .Loop5x_dec_after
3278 // If x2
($len) equal to
-0x10, the left blocks is
4.
3279 // After specially processing
, utilize the five blocks processing again
.
3280 // It will
use the following IVs
: $iv0,$iv0,$iv1,$iv2,$iv3.
3287 veor
$dat0,$iv0,$in0
3288 veor
$dat1,$iv1,$in1
3289 veor
$dat2,$in2,$iv2
3290 veor
$dat3,$in3,$iv3
3291 veor
$dat4,$in4,$iv4
3292 b
.eq .Loop5x_xts_dec
3298 add
$rounds,$rounds0,#2
3299 subs
$len,$len,#0x30
3300 b
.lo
.Lxts_inner_dec_tail
3302 veor
$dat0,$iv0,$in2
3303 veor
$dat1,$iv1,$in3
3304 veor
$dat2,$in4,$iv2
3305 b
.Lxts_outer_dec_tail
3310 vld1
.32
{$dat0},[$inp],#16
3311 veor
$tmp1,$dat1,$tmp0
3312 vst1
.8
{$tmp1},[$out],#16
3313 veor
$tmp2,$dat2,$tmp2
3314 vst1
.8
{$tmp2},[$out],#16
3315 veor
$tmp3,$dat3,$tmp3
3316 veor
$tmp4,$dat4,$tmp4
3317 vst1
.8
{$tmp3-$tmp4},[$out],#32
3321 .Lxts_outer_dec_tail
:
3328 vld1
.32
{q8
},[$key_],#16
3329 subs
$rounds,$rounds,#2
3336 vld1
.32
{q9
},[$key_],#16
3337 b
.gt .Lxts_outer_dec_tail
3345 veor
$tmp0,$iv0,$rndlast
3346 subs
$len,$len,#0x30
3347 // The iv
for first block
3351 extr
$midnumx,$ivh,$ivh,#32
3352 extr
$ivh,$ivh,$ivl,#63
3353 and $tmpmw,$constnum,$midnum,asr
#31
3354 eor
$ivl,$tmpmx,$ivl,lsl
#1
3357 veor
$tmp1,$iv1,$rndlast
3358 csel
$xoffset,$len,$xoffset,lo
// x6
, w6
, is zero at this point
3365 veor
$tmp2,$iv2,$rndlast
3366 // The iv
for second block
3367 extr
$midnumx,$ivh,$ivh,#32
3368 extr
$ivh,$ivh,$ivl,#63
3369 and $tmpmw,$constnum,$midnum,asr
#31
3370 eor
$ivl,$tmpmx,$ivl,lsl
#1
3374 add
$xoffset,$xoffset,#0x20
3375 add
$inp,$inp,$xoffset // $inp is adjusted to the
last data
3379 // The iv
for third block
3380 extr
$midnumx,$ivh,$ivh,#32
3381 extr
$ivh,$ivh,$ivl,#63
3382 and $tmpmw,$constnum,$midnum,asr
#31
3383 eor
$ivl,$tmpmx,$ivl,lsl
#1
3405 vld1
.8
{$in2},[$inp],#16
3409 vld1
.32
{q8
},[$key_],#16 // re-pre-load rndkey[0]
3410 add
$rounds,$rounds0,#2
3411 veor
$tmp0,$tmp0,$dat0
3412 veor
$tmp1,$tmp1,$dat1
3413 veor
$dat2,$dat2,$tmp2
3414 vld1
.32
{q9
},[$key_],#16 // re-pre-load rndkey[1]
3415 vst1
.8
{$tmp0},[$out],#16
3416 vst1
.8
{$tmp1},[$out],#16
3417 vst1
.8
{$dat2},[$out],#16
3427 .Lxts_inner_dec_tail
:
3428 // $len == -0x10 means two blocks left
.
3430 veor
$dat1,$in3,$iv0
3431 veor
$dat2,$in4,$iv1
3432 b
.eq .Lxts_dec_tail_loop
3433 veor
$dat2,$in4,$iv0
3434 .Lxts_dec_tail_loop
:
3439 vld1
.32
{q8
},[$key_],#16
3440 subs
$rounds,$rounds,#2
3445 vld1
.32
{q9
},[$key_],#16
3446 b
.gt .Lxts_dec_tail_loop
3465 veor
$tmp1,$iv0,$rndlast
3470 veor
$tmp2,$iv1,$rndlast
3474 veor
$tmp1,$tmp1,$dat1
3475 veor
$tmp2,$tmp2,$dat2
3478 vst1
.8
{$tmp1},[$out],#16
3479 vst1
.8
{$tmp2},[$out],#16
3484 veor
$tmp1,$tmp1,$dat2
3487 vst1
.8
{$tmp1},[$out],#16
3492 b
.eq .Lxts_dec_abort
3493 // Processing the
last two blocks with cipher stealing
.
3495 cbnz x2
,.Lxts_dec_1st_done
3496 vld1
.32
{$dat0},[$inp],#16
3498 // Decrypt the
last second block to get the
last plain text block
3500 eor
$tmpin,$dat0,$iv1
3501 ldr
$rounds,[$key1,#240]
3502 vld1
.32
{$dat0},[$key1],#16
3503 sub $rounds,$rounds,#2
3504 vld1
.32
{$dat1},[$key1],#16
3505 .Loop_final_2nd_dec
:
3507 aesimc
$tmpin,$tmpin
3508 vld1
.32
{$dat0},[$key1],#16 // load key schedule...
3509 subs
$rounds,$rounds,#2
3511 aesimc
$tmpin,$tmpin
3512 vld1
.32
{$dat1},[$key1],#16 // load key schedule...
3513 b
.gt .Loop_final_2nd_dec
3516 aesimc
$tmpin,$tmpin
3517 vld1
.32
{$dat0},[$key1]
3519 veor
$tmpin,$tmpin,$dat0
3520 veor
$tmpin,$tmpin,$iv1
3521 vst1
.8
{$tmpin},[$out]
3524 add
$tmpoutp,$out,#16
3526 // Composite the tailcnt
"16 byte not aligned block" into the
last second plain blocks
3527 // to get the
last encrypted block
.
3528 .composite_dec_loop
:
3529 subs
$tailcnt,$tailcnt,#1
3530 ldrb
$l2outp,[$out,$tailcnt]
3531 ldrb
$loutp,[$tmpinp,$tailcnt]
3532 strb
$l2outp,[$tmpoutp,$tailcnt]
3533 strb
$loutp,[$out,$tailcnt]
3534 b
.gt .composite_dec_loop
3535 .Lxts_dec_load_done
:
3536 vld1
.8
{$tmpin},[$out]
3537 veor
$tmpin,$tmpin,$iv0
3539 // Decrypt the composite block to get the
last second plain text block
3540 ldr
$rounds,[$key_,#240]
3541 vld1
.8
{$dat},[$key_],#16
3542 sub $rounds,$rounds,#2
3543 vld1
.8
{$dat1},[$key_],#16
3546 aesimc
$tmpin,$tmpin
3547 vld1
.32
{$dat0},[$key_],#16 // load key schedule...
3548 subs
$rounds,$rounds,#2
3550 aesimc
$tmpin,$tmpin
3551 vld1
.32
{$dat1},[$key_],#16 // load key schedule...
3552 b
.gt .Loop_final_dec
3555 aesimc
$tmpin,$tmpin
3556 vld1
.32
{$dat0},[$key_]
3558 veor
$tmpin,$tmpin,$dat0
3559 veor
$tmpin,$tmpin,$iv0
3560 vst1
.8
{$tmpin},[$out]
3563 ldp
$tailcnt,$midnumx,[sp
,#48]
3564 ldp
$ivd10,$ivd20,[sp
,#32]
3565 ldp
$ivd30,$ivd40,[sp
,#16]
3566 ldp
$constnumx,$tmpinp,[sp
],#64
3568 .Lxts_dec_final_abort
:
3570 .size
${prefix
}_xts_decrypt
,.-${prefix
}_xts_decrypt
3577 ########################################
3578 if ($flavour =~ /64/) { ######## 64-bit code
3580 "aesd" => 0x4e285800, "aese" => 0x4e284800,
3581 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
3583 local *unaes
= sub {
3584 my ($mnemonic,$arg)=@_;
3586 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
3587 sprintf ".inst\t0x%08x\t//%s %s",
3588 $opcode{$mnemonic}|$1|($2<<5),
3592 foreach(split("\n",$code)) {
3593 s/\`([^\`]*)\`/eval($1)/geo;
3595 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
3596 s/@\s/\/\
//o; # old->new style commentary
3598 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3599 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
3600 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
3601 s/vmov\.i8/movi/o or # fix up legacy mnemonics
3603 s/vrev32\.8/rev32/o or
3604 s/vtst\.8/cmtst/o or
3606 s/^(\s+)v/$1/o or # strip off v prefix
3607 s/\bbx\s+lr\b/ret/o;
3609 # fix up remaining legacy suffixes
3611 m/\],#8/o and s/\.16b/\.8b/go;
3612 s/\.[ui]?32//o and s/\.16b/\.4s/go;
3613 s/\.[ui]?64//o and s/\.16b/\.2d/go;
3614 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
3616 # Switch preprocessor checks to aarch64 versions.
3617 s/__ARME([BL])__/__AARCH64E$1__/go;
3621 } else { ######## 32-bit code
3623 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
3624 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
3626 local *unaes
= sub {
3627 my ($mnemonic,$arg)=@_;
3629 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3630 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3631 |(($2&7)<<1) |(($2&8)<<2);
3632 # since ARMv7 instructions are always encoded little-endian.
3633 # correct solution is to use .inst directive, but older
3634 # assemblers don't implement it:-(
3635 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3636 $word&0xff,($word>>8)&0xff,
3637 ($word>>16)&0xff,($word>>24)&0xff,
3645 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3646 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
3647 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3653 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3654 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3660 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3661 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3664 foreach(split("\n",$code)) {
3665 s/\`([^\`]*)\`/eval($1)/geo;
3667 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
3668 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
3669 s/\/\/\s?
/@ /o; # new->old style commentary
3671 # fix up remaining new-style suffixes
3672 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
3675 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3676 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
3677 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
3678 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
3679 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
3680 s/^(\s+)b\./$1b/o or
3681 s/^(\s+)ret/$1bx\tlr/o;
3683 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3691 close STDOUT
or die "error closing STDOUT: $!";