Implement AES-CBC-HMAC-SHA512 on aarch64
author fangming.fang <fangming.fang@arm.com>
Fri, 26 Jan 2024 10:48:17 +0000 (10:48 +0000)
committer Matt Caswell <matt@openssl.org>
Mon, 14 Apr 2025 13:53:30 +0000 (14:53 +0100)
This is to implement #19932. It adds enc-then-mac aes-cbc-hmac-sha512 on
aarch64; aes-cbc and hmac-sha512 are interleaved to achieve better
performance. It only supports non-padding mode, which means the length of
the input data must be a multiple of 16 bytes.

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
(Merged from https://github.com/openssl/openssl/pull/22949)
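
For orientation, the stitched assembly computes the same result as running
AES-CBC and HMAC-SHA512 as two separate passes, with the MAC taken over the
ciphertext (encrypt-then-MAC). Below is a minimal reference sketch in C using
the generic one-shot EVP/HMAC APIs, illustrative only: the helper name, the
128-bit key size, and hashing exactly the ciphertext are assumptions here (the
provider passes the digest input and length separately from the cipher input).

    #include <openssl/evp.h>
    #include <openssl/hmac.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Encrypt-then-MAC reference: inlen must be a multiple of 16 (no padding),
     * mirroring the non-padding constraint of the stitched implementation. */
    static int etm_aes128cbc_hmac_sha512_ref(const uint8_t aes_key[16],
                                             const uint8_t iv[16],
                                             const uint8_t *hmac_key,
                                             size_t hmac_keylen,
                                             const uint8_t *in, size_t inlen,
                                             uint8_t *out, uint8_t tag[64])
    {
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
        int len = 0, ok = 0;
        unsigned int taglen = 0;

        if (ctx == NULL || (inlen & 15) != 0)
            goto done;
        if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, aes_key, iv))
            goto done;
        EVP_CIPHER_CTX_set_padding(ctx, 0);        /* non-padding mode */
        if (!EVP_EncryptUpdate(ctx, out, &len, in, (int)inlen)
                || !EVP_EncryptFinal_ex(ctx, out + len, &len))
            goto done;
        /* MAC is computed over the ciphertext (encrypt-then-MAC). */
        ok = HMAC(EVP_sha512(), hmac_key, (int)hmac_keylen,
                  out, inlen, tag, &taglen) != NULL;
    done:
        EVP_CIPHER_CTX_free(ctx);
        return ok;
    }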

21 files changed:
crypto/aes/asm/aes-sha512-armv8.pl [new file with mode: 0644]
crypto/aes/build.info
crypto/objects/obj_dat.h
crypto/objects/obj_mac.num
crypto/objects/objects.txt
include/crypto/aes_platform.h
include/openssl/obj_mac.h
providers/common/include/prov/providercommon.h
providers/defltprov.c
providers/fips/fipsprov.c
providers/implementations/ciphers/build.info
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha512_etm_hw.c [new file with mode: 0644]
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h
providers/implementations/include/prov/implementations.h
providers/implementations/include/prov/names.h
test/evp_libctx_test.c
test/evp_test.c
test/recipes/30-test_evp_data/evpciph_aes_stitched.txt

diff --git a/crypto/aes/asm/aes-sha512-armv8.pl b/crypto/aes/asm/aes-sha512-armv8.pl
new file mode 100644 (file)
index 0000000..f51ce83
--- /dev/null
@@ -0,0 +1,2967 @@
+#! /usr/bin/env perl
+
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$code=<<___;
+#include "arm_arch.h"
+
+# These are offsets into the CIPH_DIGEST struct
+#define CIPHER_KEY     0
+#define CIPHER_KEY_ROUNDS      8
+#define CIPHER_IV      16
+#define HMAC_IKEYPAD   24
+#define HMAC_OKEYPAD   32
+
+.text
+.arch armv8-a+crypto
+___
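
The offsets above imply an argument block of five 8-byte fields. A
hypothetical C view of that layout follows; the field names are assumptions,
and the real CIPH_DIGEST definition lives in the provider's ETM sources, not
in this file.

    #include <stdint.h>

    /* Illustrative only: offsets match the #defines above (0, 8, 16, 24, 32). */
    struct ciph_digest_layout {
        const uint8_t *cipher_key;        /* CIPHER_KEY:        expanded AES round keys */
        uint64_t       cipher_key_rounds; /* CIPHER_KEY_ROUNDS: 10, 12 or 14            */
        const uint8_t *cipher_iv;         /* CIPHER_IV:         16-byte CBC IV          */
        const uint8_t *hmac_ikeypad;      /* HMAC_IKEYPAD:      pre-hashed inner pad    */
        const uint8_t *hmac_okeypad;      /* HMAC_OKEYPAD:      pre-hashed outer pad    */
    };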
+
+sub aes_block_9_rounds() {
+       my $i = shift;
+$code.=<<___;
+       /* aes block $i */
+       aese            v$i.16b, v8.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v9.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v10.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v11.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v12.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v13.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v14.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v15.16b
+       aesmc           v$i.16b, v$i.16b
+       aese            v$i.16b, v16.16b
+       aesmc           v$i.16b, v$i.16b
+___
+}
+
+sub aes_block_last_rounds () {
+       my $compare = shift;
+       my $label = shift;
+       my $i = shift;
+       my $load_rk10 = shift;
+
+       if($compare == 1) {
+$code.=<<___;
+       cmp             x9, #12         /* tell 128,192,256 apart */
+___
+       }
+$code.=<<___;
+       b.lt            .Laes128_${label}_$i
+.Laes192_${label}_$i:
+       ldp             q18,q19,[x7],32         /* rk[10],rk[11] */
+       aese            v$i.16b,v17.16b
+       aesmc           v$i.16b,v$i.16b
+       aese            v$i.16b,v18.16b
+       aesmc           v$i.16b,v$i.16b
+       b.gt            .Laes256_${label}_$i
+       ld1             {v18.16b},[x7]          /* rk[12] */
+       aese            v$i.16b,v19.16b
+       eor             v$i.16b,v$i.16b,v18.16b
+       sub             x7, x7, #32             /* rewind x7 */
+       b               1f
+.Laes256_${label}_$i:
+       aese            v$i.16b,v19.16b
+       aesmc           v$i.16b,v$i.16b
+       ldp             q18,q19,[x7],32         /* rk[12],rk[13] */
+       aese            v$i.16b,v18.16b
+       aesmc           v$i.16b,v$i.16b
+       ld1             {v18.16b},[x7]          /* rk[14] */
+       aese            v$i.16b,v19.16b
+       eor             v$i.16b,v$i.16b,v18.16b
+       sub             x7, x7, #64             /* rewind x7 */
+       b               1f
+.Laes128_${label}_$i:
+___
+       if ($load_rk10 == 1) {
+$code.=<<___;
+       ld1             {v18.16b},[x7]
+___
+       }
+$code.=<<___;
+       aese            v$i.16b,v17.16b
+       eor             v$i.16b,v$i.16b,v18.16b /* res */
+1:
+___
+}
+
+sub aes_block_dec_9_rounds() {
+       my $i = shift;
+$code.=<<___;
+       /* aes block $i */
+       aesd            v$i.16b, v8.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v9.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v10.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v11.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v12.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v13.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v14.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v15.16b
+       aesimc          v$i.16b, v$i.16b
+       aesd            v$i.16b, v16.16b
+       aesimc          v$i.16b, v$i.16b
+___
+}
+
+sub aes_block_dec_last_rounds () {
+       my $compare = shift;
+       my $label = shift;
+       my $i = shift;
+       my $load_rk10 = shift;
+
+       if($compare == 1) {
+$code.=<<___;
+       cmp             x9, #12                 /* tell 128,192,256 apart */
+___
+       }
+$code.=<<___;
+       b.lt            .Laes128_${label}_$i
+.Laes192_${label}_$i:
+       ldp             q18,q19,[x7],32                 /* rk[10],rk[11] */
+       aesd            v$i.16b,v17.16b
+       aesimc          v$i.16b,v$i.16b
+       aesd            v$i.16b,v18.16b
+       aesimc          v$i.16b,v$i.16b
+       b.gt            .Laes256_${label}_$i
+       ld1             {v18.16b},[x7]                  /* rk[12] */
+       aesd            v$i.16b,v19.16b
+       eor             v$i.16b,v$i.16b,v18.16b
+       sub             x7, x7, #32                     /* rewind x7 */
+       b               1f
+.Laes256_${label}_$i:
+       aesd            v$i.16b,v19.16b
+       aesimc          v$i.16b,v$i.16b
+       ldp             q18,q19,[x7],32                 /* rk[12],rk[13] */
+       aesd            v$i.16b,v18.16b
+       aesimc          v$i.16b,v$i.16b
+       ld1             {v18.16b},[x7]                  /* rk[14] */
+       aesd            v$i.16b,v19.16b
+       eor             v$i.16b,v$i.16b,v18.16b
+       sub             x7, x7, #64                     /* rewind x7 */
+       b               1f
+.Laes128_${label}_$i:
+___
+       if ($load_rk10 == 1) {
+$code.=<<___;
+       ld1     {v18.16b},[x7]
+___
+       }
+$code.=<<___;
+       aesd            v$i.16b,v17.16b
+       eor             v$i.16b,v$i.16b,v18.16b         /* res */
+1:
+___
+}
+
+sub sha512_block() {
+       my @H = map("v$_",(24..28));
+       my @QH = map("q$_",(24..28));
+       my ($FG, $DE) = map("v$_",(29..30));
+       my ($QFG, $QDE) = map("q$_",(29..30));
+       my $M9_10 = "v31";
+       my @MSG = map("v$_", (0..7));
+       my ($W0, $W1) = ("v8", "v9");
+       my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
+       my $need_revert = shift;
+
+       if($need_revert == 1) {
+$code.=<<___;
+       rev64           @MSG[0].16b, @MSG[0].16b
+       rev64           @MSG[1].16b, @MSG[1].16b
+       rev64           @MSG[2].16b, @MSG[2].16b
+       rev64           @MSG[3].16b, @MSG[3].16b
+       rev64           @MSG[4].16b, @MSG[4].16b
+       rev64           @MSG[5].16b, @MSG[5].16b
+       rev64           @MSG[6].16b, @MSG[6].16b
+       rev64           @MSG[7].16b, @MSG[7].16b
+___
+       }
+$code.=<<___;
+       /* load const k */
+       ld1             {$W0.2d}, [x10], #16
+
+       /* backup ABCDEFGH */
+       mov             $AB.16b, @H[0].16b
+       mov             $CD.16b, @H[1].16b
+       mov             $EF.16b, @H[2].16b
+       mov             $GH.16b, @H[3].16b
+___
+for($i = 0; $i < 32; $i++) {
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16)*/
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+}
+for(;$i<40;$i++) {
+$code.=<<___   if ($i<39);
+       ld1             {$W1.2d},[x10],#16
+___
+$code.=<<___   if ($i==39);
+       sub             x10, x10, #80*8 // rewind
+___
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+}
+$code.=<<___;
+       add             @H[0].2d, @H[0].2d, $AB.2d
+       add             @H[1].2d, @H[1].2d, $CD.2d
+       add             @H[2].2d, @H[2].2d, $EF.2d
+       add             @H[3].2d, @H[3].2d, $GH.2d
+___
+}
+
+{
+       my @H = map("v$_",(24..28));
+       my @QH = map("q$_",(24..28));
+       my ($FG, $DE) = map("v$_",(29..30));
+       my ($QFG, $QDE) = map("q$_",(29..30));
+       my $M9_10 = "v31";
+       my @MSG = map("v$_", (0..7));
+       my ($W0, $W1) = ("v14", "v15");
+       my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
+
+$code.=<<___;
+/*
+ * asm_aescbc_sha512_hmac(
+ *     csrc,   x0      (cipher src address)
+ *     cdst,   x1      (cipher dst address)
+ *     clen    x2      (cipher length)
+ *     dsrc,   x3      (digest src address)
+ *     ddst,   x4      (digest dst address)
+ *     dlen,   x5      (digest length)
+ *     arg     x6      :
+ *             arg->cipher.key                 (round keys)
+ *             arg->cipher.key_rounds          (key rounds)
+ *             arg->cipher.iv                  (initialization vector)
+ *             arg->digest.hmac.i_key_pad      (partially hashed i_key_pad)
+ *             arg->digest.hmac.o_key_pad      (partially hashed o_key_pad)
+ *     )
+ */
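
The register assignments documented above (x0..x6) map onto a C prototype
along these lines; this is a sketch only, and the exact declaration used by
the provider glue is not shown in this file.

    /* Assumed prototype; "struct ciph_digest_layout" refers to the argument
     * block sketched near the offset #defines earlier in this file. */
    void asm_aescbc_sha512_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
                                const uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
                                const struct ciph_digest_layout *arg);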
+
+.global asm_aescbc_sha512_hmac
+.type  asm_aescbc_sha512_hmac,%function
+
+.align 6
+.LK512:
+       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
+       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+       .quad   0x3956c25bf348b538,0x59f111f1b605d019
+       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
+       .quad   0xd807aa98a3030242,0x12835b0145706fbe
+       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
+       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
+       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
+       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
+       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
+       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
+       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
+       .quad   0x06ca6351e003826f,0x142929670a0e6e70
+       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
+       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
+       .quad   0x81c2c92e47edaee6,0x92722c851482353b
+       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
+       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
+       .quad   0xd192e819d6ef5218,0xd69906245565a910
+       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
+       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
+       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
+       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
+       .quad   0x90befffa23631e28,0xa4506cebde82bde9
+       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
+       .quad   0xca273eceea26619c,0xd186b8c721c0c207
+       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
+       .quad   0x113f9804bef90dae,0x1b710b35131c471b
+       .quad   0x28db77f523047d84,0x32caab7b40c72493
+       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
+       .quad   0       // terminator
+
+       .align  4
+asm_aescbc_sha512_hmac:
+       AARCH64_VALID_CALL_TARGET
+       /* save callee save register */
+       stp             d8, d9, [sp,#-64]!
+       stp             d10, d11, [sp,#16]
+       stp             d12, d13, [sp,#32]
+       stp             d14, d15, [sp,#48]
+
+       /* load ABCDEFGH */
+       ldr             x7, [x6, #HMAC_IKEYPAD]
+       ld1             {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
+
+       ldr             x7, [x6, #CIPHER_KEY]
+       ldr             x8, [x6, #CIPHER_IV]
+       ldr             x9, [x6, #CIPHER_KEY_ROUNDS]
+       mov             x12, x7                         /* backup x7 */
+
+       adr             x10, .LK512
+
+       lsr             x11, x2, #4                     /* aes_block = len/16 */
+       cbz             x11, .Lret                      /* return if aes_block = 0 */
+
+       cmp             x11, #16
+       b.lt            .Lenc_short_case
+
+       ld1             {v0.16b}, [x0], #16             /* load plaintext */
+       ld1             {v1.16b}, [x8]                  /* load iv */
+
+       eor             v0.16b, v0.16b, v1.16b  /* iv xor plaintext */
+
+       ldp             q8, q9, [x7], #32               /* rk0, rk1 */
+       /* block 0 */
+       aese            v0.16b, v8.16b
+       aesmc           v0.16b, v0.16b
+       ldp             q10, q11, [x7], #32             /* rk2, rk3 */
+       aese            v0.16b, v9.16b
+       aesmc           v0.16b, v0.16b
+       aese            v0.16b, v10.16b
+       aesmc           v0.16b, v0.16b
+       ldp             q12, q13, [x7], #32             /* rk4, rk5 */
+       aese            v0.16b, v11.16b
+       aesmc           v0.16b, v0.16b
+       aese            v0.16b, v12.16b
+       aesmc           v0.16b, v0.16b
+       ldp             q14, q15, [x7], #32             /* rk6, rk7 */
+       aese            v0.16b, v13.16b
+       aesmc           v0.16b, v0.16b
+       aese            v0.16b, v14.16b
+       aesmc           v0.16b, v0.16b
+       ldp             q16, q17, [x7], #32             /* rk8, rk9 */
+       aese            v0.16b, v15.16b
+       aesmc           v0.16b, v0.16b
+       aese            v0.16b, v16.16b
+       aesmc           v0.16b, v0.16b
+       ld1             {v18.16b}, [x7]                 /* rk10 */
+___
+&aes_block_last_rounds(1, "enc_prelog", 0, 0);
+$code.=<<___;
+       str             q0, [x1], #16                   /* store cipher result */
+       ld1             {v1.16b}, [x0], #16             /* load next block */
+       eor             v1.16b, v1.16b, v0.16b          /* output xor block */
+___
+# process aes blocks from 1 to 7
+for($i = 1; $i < 8; $i = $i + 1) {
+       &aes_block_9_rounds($i);
+       &aes_block_last_rounds(0, "enc_prelog", $i, 0);
+       if($i != 7) {
+               $next = $i + 1;
+$code.=<<___;
+       /* load next block */
+       ld1             {v$next.16b}, [x0], #16
+       /* output xor block */
+       eor             v$next.16b, v$next.16b, v$i.16b
+___
+       }
+$code.=<<___;
+       str             q$i, [x1], #16                          /* store cipher result */
+___
+}
+$code.=<<___;
+       sub             x11, x11, #8
+
+.Lenc_main_loop:
+       mov             x7, x12
+       mov             x14, x1
+       /* aes block 0 */
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v12.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v7.16b
+
+       /* reverse message */
+       rev64           @MSG[0].16b, @MSG[0].16b
+       rev64           @MSG[1].16b, @MSG[1].16b
+       rev64           @MSG[2].16b, @MSG[2].16b
+       rev64           @MSG[3].16b, @MSG[3].16b
+       rev64           @MSG[4].16b, @MSG[4].16b
+       rev64           @MSG[5].16b, @MSG[5].16b
+       rev64           @MSG[6].16b, @MSG[6].16b
+       rev64           @MSG[7].16b, @MSG[7].16b
+       ld1             {$W0.2d}, [x10], #16                    /* load const k*/
+
+       /* backup ABCDEFGH */
+       mov             $AB.16b, @H[0].16b
+       mov             $CD.16b, @H[1].16b
+       mov             $EF.16b, @H[2].16b
+       mov             $GH.16b, @H[3].16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       cmp             x9, #12
+       b.lt            .Lenc_main_loop_aes128_0
+.Lenc_main_loop_aes192_0:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_0
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_0:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_0:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 1 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       cmp             x9, #12
+       b.lt            .Lenc_main_loop_aes128_1
+.Lenc_main_loop_aes192_1:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_1
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_1:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_1:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 2 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       cmp             x9, #12
+       b.lt            .Lenc_main_loop_aes128_2
+.Lenc_main_loop_aes192_2:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_2
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_2:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_2:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 3 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       cmp             x9, #12
+       b.lt            .Lenc_main_loop_aes128_3
+.Lenc_main_loop_aes192_3:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_3
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_3:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_3:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 4 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
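+       /* finish aes block 4: x9 holds the cipher key rounds, so x9 < 12
+        * means AES-128 (final round key rk10), x9 == 12 means AES-192
+        * (rk12) and x9 > 12 means AES-256 (rk14) */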
+       cmp             x9, #12
+       b.lt            .Lenc_main_loop_aes128_4
+.Lenc_main_loop_aes192_4:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_4
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_4:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_4:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 5 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       cmp             x9, #12
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       b.lt            .Lenc_main_loop_aes128_5
+.Lenc_main_loop_aes192_5:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_5
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_5:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_5:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 6 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ld1             {$W1.2d}, [x10], #16                    /* load const k*/
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       ext             $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       /* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
+       sha512su0       @MSG[0].2d, @MSG[1].2d
+       /* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
+       sha512su1       @MSG[0].2d, @MSG[7].2d, $M9_10.2d
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       cmp             x9, #12
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       b.lt            .Lenc_main_loop_aes128_6
+.Lenc_main_loop_aes192_6:
+       ldp             q10, q11, [x7], #32                     /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_6
+       ld1             {v8.16b},[x7]                           /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_6:
+       ldp             q8, q9, [x7], #32                       /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                          /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_6:
+       ld1             {v10.16b},[x7]                          /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       st1             {v12.16b}, [x1], #16
+       /* aes block 7 */
+       mov             x7, x12
+       ldp             q8, q9, [x7], #32                       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32                     /* rk2, rk3 */
+
+       ld1             {v13.16b}, [x0], #16
+       eor             v12.16b, v12.16b, v13.16b
+
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk4, rk5 */
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q10, q11, [x7], #32                     /* rk6, rk7 */
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       ld1             {$W1.2d},[x10],#16
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       ldp             q8, q9, [x7], #32                       /* rk8, rk9 */
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       sub             x10, x10, #80*8                 /* rewind x10 to the start of the .LK512 constants */
+       add             $W0.2d, $W0.2d, $MSG[0].2d              /* Kt + Wt */
+       ext             $W0.16b, $W0.16b, $W0.16b, #8
+       ext             $FG.16b, @H[2].16b, @H[3].16b, #8
+       ext             $DE.16b, @H[1].16b, @H[2].16b, #8
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       /* T1 = h + Kt + Wt*/
+       add             @H[3].2d, @H[3].2d, $W0.2d
+       /* T1 = T1 + BSIG1(e) + CH(e,f,g) */
+       sha512h         @QH[3], $QFG, $DE.2d
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       cmp             x9, #12
+       add             @H[4].2d, @H[1].2d, @H[3].2d            /* d + T1 */
+       /* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
+       sha512h2        @QH[3], @QH[1], @H[0].2d
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       # h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+       @QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
+$code.=<<___;
+       b.lt            .Lenc_main_loop_aes128_7
+.Lenc_main_loop_aes192_7:
+       ldp             q10, q11, [x7], #32             /* rk10, rk11 */
+       aese            v12.16b, v9.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v10.16b
+       aesmc           v12.16b, v12.16b
+       b.gt            .Lenc_main_loop_aes256_7
+       ld1             {v8.16b},[x7]                   /* rk12 */
+       aese            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       b               1f
+.Lenc_main_loop_aes256_7:
+       ldp             q8, q9, [x7], #32               /* rk12, rk13 */
+       aese            v12.16b, v11.16b
+       aesmc           v12.16b, v12.16b
+       ld1             {v10.16b},[x7]                  /* rk14 */
+       aese            v12.16b, v8.16b
+       aesmc           v12.16b, v12.16b
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       b               1f
+.Lenc_main_loop_aes128_7:
+       ld1             {v10.16b},[x7]                  /* rk10 */
+       aese            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+1:
+       add             @H[0].2d, @H[0].2d, $AB.2d
+       add             @H[1].2d, @H[1].2d, $CD.2d
+       add             @H[2].2d, @H[2].2d, $EF.2d
+       add             @H[3].2d, @H[3].2d, $GH.2d
+
+       st1             {v12.16b}, [x1], #16
+
+       ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x14], #64
+       ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x14]
+
+       sub             x11, x11, #8
+       cmp             x11, #8
+       b.ge    .Lenc_main_loop
+
+       /* epilog - process sha block */
+___
+       &sha512_block(1);
+$code.=<<___;
+       mov             x7, x12
+       ld1             {v0.16b}, [x0], #16             /* load plaintext */
+       ldr             q1, [x14, #48]                  /* load the output of the last aes block */
+       eor             v0.16b, v0.16b, v1.16b
+
+       ldp             q8, q9, [x7], #32               /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32             /* rk2, rk3 */
+       ldp             q12, q13, [x7], #32             /* rk4, rk5 */
+       ldp             q14, q15, [x7], #32             /* rk6, rk7 */
+       ldp             q16, q17, [x7], #32             /* rk8, rk9 */
+       ld1             {v18.16b}, [x7]                 /* rk10 */
+
+       mov             w12, #0x80                              /* sha padding 0b10000000 */
+       b               .Lenc_less_than_8_block
+
+       /* aes_block < 16 */
+.Lenc_short_case:
+       ld1             {v0.16b}, [x0], #16             /* load plaintext */
+       ld1             {v1.16b}, [x8]                  /* load iv */
+       ldp             q8, q9, [x7], #32               /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32             /* rk2, rk3 */
+       ldp             q12, q13, [x7], #32             /* rk4, rk5 */
+       ldp             q14, q15, [x7], #32             /* rk6, rk7 */
+       ldp             q16, q17, [x7], #32             /* rk8, rk9 */
+       ld1             {v18.16b}, [x7]                 /* rk10 */
+       mov             w12, #0x80                              /* sha padding 0b10000000 */
+
+       eor             v0.16b, v0.16b, v1.16b  /* iv xor plaintext */
+
+       cmp             x11, #8
+       b.lt            .Lenc_less_than_8_block
+___
+# process 8 aes blocks
+for($i = 0; $i < 8; $i = $i + 1) {
+       &aes_block_9_rounds($i);
+       # only distinguish AES-128/192/256 on the first block
+       &aes_block_last_rounds(($i == 0)?1:0, "enc_short", $i, 0);
+       if($i != 7) {
+               $next = $i + 1;
+$code.=<<___;
+       /* load next block */
+       ld1             {v$next.16b}, [x0], #16
+       /* CBC chain: xor the next plaintext block with this ciphertext block */
+       eor             v$next.16b, v$next.16b, v$i.16b
+___
+       }
+}
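+# the eight blocks generated above are CBC-chained: each freshly loaded
+# plaintext block is xored with the ciphertext of the previous block before
+# it is encrypted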
+$code.=<<___;
+       /* store 8 blocks of ciphertext */
+       stp             q0, q1, [x1], #32
+       stp             q2, q3, [x1], #32
+       stp             q4, q5, [x1], #32
+       stp             q6, q7, [x1], #32
+
+       sub             x11, x11, #8
+___
+       # now we have a whole sha512 block
+       &sha512_block(1);
+$code.=<<___;
+       ldr             x7, [x6, #CIPHER_KEY]
+       ldp             q8, q9, [x7]                    /* restore clobbered rk0, rk1 */
+       add             x7, x7, #160                    /* x7 point to rk10 */
+       cbz             x11, .Lenc_short_no_more_aes_block
+       ld1             {v0.16b}, [x0], #16             /* load plaintext */
+       ldr             q1, [x1, -16]
+       eor             v0.16b, v0.16b, v1.16b
+.Lenc_less_than_8_block:
+       cbz             x11, .Lenc_short_no_more_aes_block
+___
+# process the remaining aes blocks (<= 7)
+for($i = 0; $i < 7; $i = $i + 1) {
+       &aes_block_9_rounds($i);
+       &aes_block_last_rounds(($i == 0)?1:0, "enc_short_partial", $i, 0);
+$code.=<<___;
+       str             q$i, [x1], #16
+       sub             x11, x11, #1
+       cbz             x11, .Lenc_short_post_Q$i
+___
+       if($i != 6) {
+               $next = $i + 1;
+$code.=<<___;
+       /* load next block*/
+       ld1             {v$next.16b}, [x0], #16
+       /* CBC chain: xor the next plaintext block with this ciphertext block */
+       eor             v$next.16b, v$next.16b, v$i.16b
+___
+       }
+}
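+# the .Lenc_short_post_Q<i> labels below zero the unused sha512 message
+# registers and place the 0x80 padding byte in the register that follows the
+# last ciphertext block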
+$code.=<<___;
+.Lenc_short_no_more_aes_block:
+       eor             v0.16b, v0.16b, v0.16b
+       eor             v1.16b, v1.16b, v1.16b
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v0.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q0:
+       eor             v1.16b, v1.16b, v1.16b
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v1.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q1:
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v2.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q2:
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v3.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q3:
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v4.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q4:
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v5.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q5:
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v6.b[0], w12
+       b               .Lenc_short_post_sha
+.Lenc_short_post_Q6:
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v7.b[0], w12
+       /* we now have one padded sha512 block; process it and
+          then use another block to hold the sha length */
+___
+&sha512_block(1);
+$code.=<<___;
+       eor             v0.16b, v0.16b, v0.16b
+       eor             v1.16b, v1.16b, v1.16b
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+.Lenc_short_post_sha:
+       /* we now have the last padded sha512 block */
+       eor             x13, x13, x13                   /* length_lo */
+       eor             x14, x14, x14                   /* length_hi */
+
+       adds            x13, x13, x2, lsl #3            /* add len in bits */
+       lsr             x15, x2, #61
+       adc             x14, x14, x15
+
+       adds            x13, x13, #1024                 /* add i_key_pad 1024 bits */
+       adc             x14, x14, xzr
+
+       mov             v7.d[0], x14
+       mov             v7.d[1], x13
+       rev64           v7.16b, v7.16b
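+       /* example: a 48-byte input gives 1024 + 48*8 = 1408 bits; the
+        * 128-bit length closes the final sha512 block */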
+___
+&sha512_block(1);
+$code.=<<___;
+       /* Final HMAC - opad part */
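+       /* outer hash: SHA512(o_key_pad || inner digest); o_key_pad was
+        * hashed in advance (one 1024-bit block), so only the 512-bit inner
+        * digest plus padding remains, hence the 1024+512 bit length below */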
+       mov             v0.16b, v24.16b
+       mov             v1.16b, v25.16b
+       mov             v2.16b, v26.16b
+       mov             v3.16b, v27.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+
+       mov             v4.b[7], w12                    /* sha padding: leading 1 bit (0x80) */
+       mov             x13, #1024+512                  /* length in bits */
+       mov             v7.d[1], x13
+
+       /* load ABCDEFGH for opad */
+       ldr             x7, [x6, #HMAC_OKEYPAD]
+       ld1             {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
+___
+&sha512_block(0);
+$code.=<<___;
+.Lret:
+       mov             x0, xzr                         /* return 0 */
+
+       rev64           v24.16b, v24.16b
+       rev64           v25.16b, v25.16b
+       rev64           v26.16b, v26.16b
+       rev64           v27.16b, v27.16b
+
+       /* store hash result */
+       st1             {v24.2d,v25.2d,v26.2d,v27.2d},[x4]
+
+       /* restore callee save register */
+       ldp             d10, d11, [sp,#16]
+       ldp             d12, d13, [sp,#32]
+       ldp             d14, d15, [sp,#48]
+       ldp             d8, d9, [sp], #64
+       ret
+.size asm_aescbc_sha512_hmac, .-asm_aescbc_sha512_hmac
+___
+}
+
+{
+       my @H = map("v$_",(24..28));
+       my @QH = map("q$_",(24..28));
+       my ($FG, $DE) = map("v$_",(29..30));
+       my ($QFG, $QDE) = map("q$_",(29..30));
+       my $M9_10 = "v31";
+       my @MSG = map("v$_", (0..7));
+       my ($W0, $W1) = ("v14", "v15");
+       my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
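+       # register map for the decrypt path (mirrors the encrypt side):
+       # @H/@QH - sha512 working variables, @MSG - message block,
+       # $W0/$W1 - schedule word and round constant, $AB..$GH - hash state
+       # carried between blocks, $FG/$DE/$M9_10 - shuffled sha512h operands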
+
+$code.=<<___;
+/*
+ * asm_sha512_hmac_aescbc_dec(
+ *     csrc,   x0      (cipher src address)
+ *     cdst,   x1      (cipher dst address)
+ *     clen,   x2      (cipher length)
+ *     dsrc,   x3      (digest src address)
+ *     ddst,   x4      (digest dst address)
+ *     dlen,   x5      (digest length)
+ *     arg     x6      :
+ *             arg->cipher.key                 (round keys)
+ *             arg->cipher.key_rounds          (key rounds)
+ *             arg->cipher.iv                  (initialization vector)
+ *             arg->digest.hmac.i_key_pad      (partially hashed i_key_pad)
+ *             arg->digest.hmac.o_key_pad      (partially hashed o_key_pad)
+ *     )
+ */
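+
+/*
+ * Illustration only: a hypothetical C-side view of the x6 argument that
+ * matches the field order documented above; the real structure and its
+ * offsets are defined by the C glue code, not by this file.
+ *
+ *     #include <stdint.h>
+ *
+ *     struct aes_cbc_hmac_sha512_arg {            // hypothetical name
+ *         struct {
+ *             const uint8_t *key;                 // expanded AES round keys
+ *             uint64_t       key_rounds;          // 10, 12 or 14
+ *             const uint8_t *iv;                  // 16-byte CBC IV
+ *         } cipher;
+ *         struct {
+ *             struct {
+ *                 const uint8_t *i_key_pad;       // sha512 state after hashing i_key_pad
+ *                 const uint8_t *o_key_pad;       // sha512 state after hashing o_key_pad
+ *             } hmac;
+ *         } digest;
+ *     };
+ */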
+
+.global asm_sha512_hmac_aescbc_dec
+.type  asm_sha512_hmac_aescbc_dec,%function
+
+.align 4
+asm_sha512_hmac_aescbc_dec:
+       AARCH64_VALID_CALL_TARGET
+       /* save callee save register */
+       stp             d8, d9, [sp,#-64]!
+       stp             d10, d11, [sp,#16]
+       stp             d12, d13, [sp,#32]
+       stp             d14, d15, [sp,#48]
+
+       /* load ABCDEFGH */
+       ldr             x7, [x6, #HMAC_IKEYPAD]
+       ld1             {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
+
+       ldr             x7, [x6, #CIPHER_KEY]
+       ldr             x8, [x6, #CIPHER_IV]
+       ldr             x9, [x6, #CIPHER_KEY_ROUNDS]
+       mov             x12, x7                 /* backup x7 */
+
+       adr             x10, .LK512
+
+       lsr             x11, x2, #4             /* aes_block = len/16 */
+       cbz             x11, .Ldec_ret          /* return if aes_block = 0 */
+
+       ld1             {v20.16b}, [x8]         /* load iv */
+       cmp             x11, #8
+       b.lt            .Ldec_short_case
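+       /* main decryption loop: each iteration loads 8 ciphertext blocks,
+        * keeps a copy in v0-v7 for the sha512 update (enc-then-mac hashes
+        * the ciphertext), decrypts them in v12-v19 and xors each result
+        * with the preceding ciphertext block to recover the plaintext */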
+.Ldec_main_loop:
+       ldp             q12, q13, [x0], #32
+       ldp             q14, q15, [x0], #32
+       ldp             q16, q17, [x0], #32
+       ldp             q18, q19, [x0], #32
+
+       ldp             q8, q9, [x7], #32       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32     /* rk2, rk3 */
+
+       mov             v0.16b, v12.16b
+       mov             v1.16b, v13.16b
+       mov             v2.16b, v14.16b
+       mov             v3.16b, v15.16b
+       mov             v4.16b, v16.16b
+       mov             v5.16b, v17.16b
+       mov             v6.16b, v18.16b
+       mov             v7.16b, v19.16b
+
+       /* 1 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v8.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v8.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v8.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 2 round */
+       aesd            v12.16b, v9.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v9.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v9.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v9.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v9.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v9.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v9.16b
+       aesimc          v19.16b, v19.16b
+
+       ldp             q8, q9, [x7], #32       /* rk4, rk5 */
+
+       /* 3 round */
+       aesd            v12.16b, v10.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v10.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v10.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v10.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v10.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v10.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v10.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 4 round */
+       aesd            v12.16b, v11.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v11.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v11.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v11.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v11.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v11.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v11.16b
+       aesimc          v19.16b, v19.16b
+
+       ldp             q10, q11, [x7], #32     /* rk6, rk7 */
+
+       /* 5 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v8.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v8.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v8.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 6 round */
+       aesd            v12.16b, v9.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v9.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v9.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v9.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v9.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v9.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v9.16b
+       aesimc          v19.16b, v19.16b
+
+       ldp             q8, q9, [x7], #32       /* rk8, rk9 */
+
+       /* 7 round */
+       aesd            v12.16b, v10.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v10.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v10.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v10.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v10.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v10.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v10.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 8 round */
+       aesd            v12.16b, v11.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v11.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v11.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v11.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v11.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v11.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v11.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 9 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v8.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v8.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v8.16b
+       aesimc          v19.16b, v19.16b
+
+       cmp             x9, #12                 /* tell 128,192,256 apart */
+
+       b.lt            .Laes128_dec_main
+.Laes192_dec_main:
+       ldp             q10, q11, [x7], #32     /* rk10, rk11 */
+       /* 10 round */
+       aesd            v12.16b, v9.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v9.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v9.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v9.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v9.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v9.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v9.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 11 round */
+       aesd            v12.16b, v10.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v10.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v10.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v10.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v10.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v10.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v10.16b
+       aesimc          v19.16b, v19.16b
+       b.gt            .Laes256_dec_main
+
+       ld1             {v8.16b},[x7]           /* rk12 */
+
+       /* 12 round */
+       aesd            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       aesd            v13.16b, v11.16b
+       eor             v13.16b, v13.16b, v8.16b
+       aesd            v14.16b, v11.16b
+       eor             v14.16b, v14.16b, v8.16b
+       aesd            v15.16b, v11.16b
+       eor             v15.16b, v15.16b, v8.16b
+       aesd            v16.16b, v11.16b
+       eor             v16.16b, v16.16b, v8.16b
+       aesd            v17.16b, v11.16b
+       eor             v17.16b, v17.16b, v8.16b
+       aesd            v18.16b, v11.16b
+       eor             v18.16b, v18.16b, v8.16b
+       aesd            v19.16b, v11.16b
+       eor             v19.16b, v19.16b, v8.16b
+
+       sub             x7, x7, #192            /* rewind x7 */
+       b               1f
+.Laes256_dec_main:
+       ldp             q8, q9, [x7], #32       /* rk12, rk13 */
+       /* 12 round */
+       aesd            v12.16b, v11.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v11.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v11.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v11.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v11.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v11.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v11.16b
+       aesimc          v19.16b, v19.16b
+
+       /* 13 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v17.16b, v8.16b
+       aesimc          v17.16b, v17.16b
+       aesd            v18.16b, v8.16b
+       aesimc          v18.16b, v18.16b
+       aesd            v19.16b, v8.16b
+       aesimc          v19.16b, v19.16b
+
+       ld1             {v10.16b},[x7]                  /* rk14 */
+
+       /* 14 round */
+       aesd            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       aesd            v13.16b, v9.16b
+       eor             v13.16b, v13.16b, v10.16b
+       aesd            v14.16b, v9.16b
+       eor             v14.16b, v14.16b, v10.16b
+       aesd            v15.16b, v9.16b
+       eor             v15.16b, v15.16b, v10.16b
+       aesd            v16.16b, v9.16b
+       eor             v16.16b, v16.16b, v10.16b
+       aesd            v17.16b, v9.16b
+       eor             v17.16b, v17.16b, v10.16b
+       aesd            v18.16b, v9.16b
+       eor             v18.16b, v18.16b, v10.16b
+       aesd            v19.16b, v9.16b
+       eor             v19.16b, v19.16b, v10.16b
+
+       sub             x7, x7, #224
+       b               1f
+.Laes128_dec_main:
+       ld1             {v10.16b},[x7]                  /* rk10 */
+       aesd            v12.16b,v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       aesd            v13.16b,v9.16b
+       eor             v13.16b, v13.16b, v10.16b
+       aesd            v14.16b,v9.16b
+       eor             v14.16b, v14.16b, v10.16b
+       aesd            v15.16b,v9.16b
+       eor             v15.16b, v15.16b, v10.16b
+       aesd            v16.16b,v9.16b
+       eor             v16.16b, v16.16b, v10.16b
+       aesd            v17.16b,v9.16b
+       eor             v17.16b, v17.16b, v10.16b
+       aesd            v18.16b,v9.16b
+       eor             v18.16b, v18.16b, v10.16b
+       aesd            v19.16b,v9.16b
+       eor             v19.16b, v19.16b, v10.16b
+       sub             x7, x7, #160
+
+1:
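+       /* CBC decrypt: P[i] = D(C[i]) ^ C[i-1]; v20 holds the IV or the
+        * last ciphertext block of the previous batch, v0-v6 hold C[0]-C[6] */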
+       eor             v12.16b, v12.16b, v20.16b
+       eor             v13.16b, v13.16b, v0.16b
+       eor             v14.16b, v14.16b, v1.16b
+       eor             v15.16b, v15.16b, v2.16b
+       eor             v16.16b, v16.16b, v3.16b
+       eor             v17.16b, v17.16b, v4.16b
+       eor             v18.16b, v18.16b, v5.16b
+       eor             v19.16b, v19.16b, v6.16b
+
+       stp             q12,q13, [x1], #32
+       ldr             q12, [x0, #-16]         /* load the last ciphertext block */
+       stp             q14,q15, [x1], #32
+       stp             q16,q17, [x1], #32
+       stp             q18,q19, [x1], #32
+___
+       &sha512_block(1);
+$code.=<<___;
+       mov             v20.16b, v12.16b        /* keep the last ciphertext block as the next chaining value */
+       sub             x11, x11, #8
+       cmp             x11, #8
+       b.ge            .Ldec_main_loop
+
+       /* aes_block < 8 */
+.Ldec_short_case:
+       mov             w12, #0x80              /* sha padding 0b10000000 */
+       cbnz            x11, 1f
+       eor             v0.16b, v0.16b, v0.16b
+       eor             v1.16b, v1.16b, v1.16b
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v0.b[0], w12
+       b               .Ldec_short_post_sha
+1:
+       cmp             x11, #4
+       b.lt            .Ldec_less_than_4_block
+
+       ldp             q8, q9, [x7], #32       /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32     /* rk2, rk3 */
+
+       ldp             q12, q13, [x0], #32
+       ldp             q14, q15, [x0], #32
+
+       mov             v0.16b, v12.16b
+       mov             v1.16b, v13.16b
+       mov             v2.16b, v14.16b
+       mov             v3.16b, v15.16b
+
+       /* 1 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 2 round */
+       aesd            v12.16b, v9.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v9.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v9.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v9.16b
+       aesimc          v15.16b, v15.16b
+
+       ldp             q8, q9, [x7], #32       /* rk4, rk5 */
+
+       /* 3 round */
+       aesd            v12.16b, v10.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v10.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v10.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v10.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 4 round */
+       aesd            v12.16b, v11.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v11.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v11.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v11.16b
+       aesimc          v15.16b, v15.16b
+
+       ldp             q10, q11, [x7], #32     /* rk6, rk7 */
+
+       /* 5 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 6 round */
+       aesd            v12.16b, v9.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v9.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v9.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v9.16b
+       aesimc          v15.16b, v15.16b
+
+       ldp             q8, q9, [x7], #32       /* rk8, rk9 */
+
+       /* 7 round */
+       aesd            v12.16b, v10.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v10.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v10.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v10.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 8 round */
+       aesd            v12.16b, v11.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v11.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v11.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v11.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 9 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+
+       cmp             x9, #12                 /* tell 128,192,256 apart */
+
+       b.lt            .Laes128_dec_short
+.Laes192_dec_short:
+       ldp             q10, q11, [x7], #32     /* rk10, rk11 */
+
+       /* 10 round */
+       aesd            v12.16b, v9.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v9.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v9.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v9.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 11 round */
+       aesd            v12.16b, v10.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v10.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v10.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v10.16b
+       aesimc          v15.16b, v15.16b
+       b.gt            .Laes256_dec_short
+
+       ld1             {v8.16b},[x7]                   /* rk12 */
+
+       /* 12 round */
+       aesd            v12.16b, v11.16b
+       eor             v12.16b, v12.16b, v8.16b
+       aesd            v13.16b, v11.16b
+       eor             v13.16b, v13.16b, v8.16b
+       aesd            v14.16b, v11.16b
+       eor             v14.16b, v14.16b, v8.16b
+       aesd            v15.16b, v11.16b
+       eor             v15.16b, v15.16b, v8.16b
+
+       sub             x7, x7, #192                    /* rewind x7 */
+       b               1f
+.Laes256_dec_short:
+       ldp             q8,q9,[x7],32                   /* rk12,rk13 */
+       /* 12 round */
+       aesd            v12.16b, v11.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v11.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v11.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v11.16b
+       aesimc          v15.16b, v15.16b
+
+       /* 13 round */
+       aesd            v12.16b, v8.16b
+       aesimc          v12.16b, v12.16b
+       aesd            v13.16b, v8.16b
+       aesimc          v13.16b, v13.16b
+       aesd            v14.16b, v8.16b
+       aesimc          v14.16b, v14.16b
+       aesd            v15.16b, v8.16b
+       aesimc          v15.16b, v15.16b
+
+       ld1             {v10.16b},[x7]                  /* rk14 */
+
+       /* 14 round */
+       aesd            v12.16b, v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       aesd            v13.16b, v9.16b
+       eor             v13.16b, v13.16b, v10.16b
+       aesd            v14.16b, v9.16b
+       eor             v14.16b, v14.16b, v10.16b
+       aesd            v15.16b, v9.16b
+       eor             v15.16b, v15.16b, v10.16b
+
+       sub             x7, x7, #224
+       b               1f
+.Laes128_dec_short:
+       ld1             {v10.16b},[x7]                  /* rk10 */
+       aesd            v12.16b,v9.16b
+       eor             v12.16b, v12.16b, v10.16b
+       aesd            v13.16b,v9.16b
+       eor             v13.16b, v13.16b, v10.16b
+       aesd            v14.16b,v9.16b
+       eor             v14.16b, v14.16b, v10.16b
+       aesd            v15.16b,v9.16b
+       eor             v15.16b, v15.16b, v10.16b
+       sub             x7, x7, #160
+1:
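+       /* CBC: xor with the previous ciphertext blocks (IV first), keep the last one in v20 */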
+       eor             v12.16b, v12.16b, v20.16b
+       eor             v13.16b, v13.16b, v0.16b
+       eor             v14.16b, v14.16b, v1.16b
+       eor             v15.16b, v15.16b, v2.16b
+       ldr             q20, [x0, #-16]
+
+       sub             x11, x11, #4
+
+       stp             q12,q13, [x1], #32
+       stp             q14,q15, [x1], #32
+       cbz             x11, .Ldec_short_post_Q3
+___
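+# decrypt the remaining blocks of the short tail one at a time (at most three)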
+for($i = 0; $i < 3; $i = $i + 1) {
+       $block = $i + 4;
+$code.=<<___;
+       ld1             {v16.16b}, [x0], #16
+       mov             v$block.16b, v16.16b
+
+       ldp             q8, q9, [x7], #32               /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32             /* rk2, rk3 */
+
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       ldp             q8, q9, [x7], #32               /* rk4, rk5 */
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       ldp             q10, q11, [x7], #32             /* rk6, rk7 */
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       ldp             q8, q9, [x7], #32               /* rk8, rk9 */
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       cmp             x9, #12                 /* tell 128,192,256 apart */
+       b.lt            .Laes128_dec_short_$block
+.Laes192_dec_short_$block:
+       ldp             q10,q11,[x7],32                 /* rk10,rk11 */
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       b.gt            .Laes256_dec_short_$block
+       ld1             {v8.16b},[x7]                   /* rk12 */
+       aesd            v16.16b, v11.16b
+       eor             v16.16b, v16.16b, v8.16b
+       sub             x7, x7, #192                    /* rewind x7 */
+       b               1f
+.Laes256_dec_short_$block:
+       ldp             q8,q9,[x7],32                   /* rk12,rk13 */
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       ld1             {v10.16b},[x7]                  /* rk14 */
+       aesd            v16.16b, v9.16b
+       eor             v16.16b, v16.16b, v10.16b
+       sub             x7, x7, #224
+       b               1f
+.Laes128_dec_short_$block:
+       ld1             {v10.16b},[x7]                  /* rk10 */
+       aesd            v16.16b,v9.16b
+       eor             v16.16b, v16.16b, v10.16b
+       sub             x7, x7, #160
+1:
+       sub             x11, x11, 1
+       eor             v16.16b, v16.16b, v20.16b
+       ldr             q20, [x0, #-16]
+       st1             {v16.16b}, [x1], #16
+       cbz             x11, .Ldec_short_post_Q$block
+___
+}
+$code.=<<___;
+.Ldec_short_post_Q3:
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v4.b[0], w12
+       b               .Ldec_short_post_sha
+.Ldec_short_post_Q4:
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v5.b[0], w12
+       b               .Ldec_short_post_sha
+.Ldec_short_post_Q5:
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v6.b[0], w12
+       b               .Ldec_short_post_sha
+.Ldec_short_post_Q6:
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v7.b[0], w12
+       /* we now have one padded sha512 block; process it and
+          then use another block to hold the sha length */
+___
+&sha512_block(1);
+$code.=<<___;
+       eor             v0.16b, v0.16b, v0.16b
+       eor             v1.16b, v1.16b, v1.16b
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       b               .Ldec_short_post_sha
+
+.Ldec_less_than_4_block:
+___
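+# fewer than four blocks in total: decrypt them one at a time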
+for($i = 0; $i < 3; $i = $i + 1) {
+$code.=<<___;
+       ld1             {v16.16b}, [x0], #16
+       mov             v$i.16b, v16.16b
+
+       ldp             q8, q9, [x7], #32               /* rk0, rk1 */
+       ldp             q10, q11, [x7], #32             /* rk2, rk3 */
+
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       ldp             q8, q9, [x7], #32               /* rk4, rk5 */
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       ldp             q10, q11, [x7], #32             /* rk6, rk7 */
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       ldp             q8, q9, [x7], #32               /* rk8, rk9 */
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       cmp             x9, #12                         /* tell 128,192,256 apart */
+       b.lt            .Laes128_dec_short_less_than_4_$i
+.Laes192_dec_short_less_than_4_$i:
+       ldp             q10,q11,[x7],32                 /* rk10,rk11 */
+       aesd            v16.16b, v9.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v10.16b
+       aesimc          v16.16b, v16.16b
+       b.gt            .Laes256_dec_short_less_than_4_$i
+       ld1             {v8.16b},[x7]                   /* rk12 */
+       aesd            v16.16b, v11.16b
+       eor             v16.16b, v16.16b, v8.16b
+       sub             x7, x7, #192                    /* rewind x7 */
+       b               1f
+.Laes256_dec_short_less_than_4_$i:
+       ldp             q8,q9,[x7],32                   /* rk12,rk13 */
+       aesd            v16.16b, v11.16b
+       aesimc          v16.16b, v16.16b
+       aesd            v16.16b, v8.16b
+       aesimc          v16.16b, v16.16b
+       ld1             {v10.16b},[x7]                  /* rk14 */
+       aesd            v16.16b, v9.16b
+       eor             v16.16b, v16.16b, v10.16b
+       sub             x7, x7, #224
+       b               1f
+.Laes128_dec_short_less_than_4_$i:
+       ld1             {v10.16b},[x7]                  /* rk10 */
+       aesd            v16.16b,v9.16b
+       eor             v16.16b, v16.16b, v10.16b
+       sub             x7, x7, #160
+1:
+       sub             x11, x11, 1
+       eor             v16.16b, v16.16b, v20.16b
+       ldr             q20, [x0, #-16]
+       st1             {v16.16b}, [x1], #16
+       cbz             x11, .Ldec_short_post_Q$i
+___
+}
+$code.=<<___;
+.Ldec_short_post_Q0:
+       eor             v1.16b, v1.16b, v1.16b
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v1.b[0], w12
+       b               .Ldec_short_post_sha
+.Ldec_short_post_Q1:
+       eor             v2.16b, v2.16b, v2.16b
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v2.b[0], w12
+       b               .Ldec_short_post_sha
+.Ldec_short_post_Q2:
+       eor             v3.16b, v3.16b, v3.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+       mov             v3.b[0], w12
+       b               .Ldec_short_post_sha
+.Ldec_short_post_sha:
+       /* we now have the last padded sha512 block */
+       eor             x13, x13, x13           /* length_lo */
+       eor             x14, x14, x14           /* length_hi */
+
+       adds            x13, x13, x2, lsl #3    /* add len in bits */
+       lsr             x15, x2, #61
+       adc             x14, x14, x15
+
+       adds            x13, x13, #1024         /* add i_key_pad 1024 bits */
+       adc             x14, x14, xzr
+
+       mov             v7.d[0], x14
+       mov             v7.d[1], x13
+       rev64           v7.16b, v7.16b
+___
+&sha512_block(1);
+$code.=<<___;
+       /* Final HMAC - opad part */
+       mov             v0.16b, v24.16b
+       mov             v1.16b, v25.16b
+       mov             v2.16b, v26.16b
+       mov             v3.16b, v27.16b
+       eor             v4.16b, v4.16b, v4.16b
+       eor             v5.16b, v5.16b, v5.16b
+       eor             v6.16b, v6.16b, v6.16b
+       eor             v7.16b, v7.16b, v7.16b
+
+       mov             v4.b[7], w12            /* padding 1 */
+       mov             x13, #1024+512          /* length in bits */
+       mov             v7.d[1], x13
+
+       /* load ABCDEFGH for opad */
+       ldr             x7, [x6, #HMAC_OKEYPAD]
+       ld1             {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
+___
+&sha512_block(0);
+$code.=<<___;
+.Ldec_ret:
+       mov             x0, xzr                 /* return 0 */
+
+       rev64           v24.16b, v24.16b
+       rev64           v25.16b, v25.16b
+       rev64           v26.16b, v26.16b
+       rev64           v27.16b, v27.16b
+
+       /* store hash result */
+       st1             {v24.2d,v25.2d,v26.2d,v27.2d},[x4]
+
+       /* restore callee save register */
+       ldp             d10, d11, [sp,#16]
+       ldp             d12, d13, [sp,#32]
+       ldp             d14, d15, [sp,#48]
+       ldp             d8, d9, [sp], #64
+       ret
+.size asm_sha512_hmac_aescbc_dec, .-asm_sha512_hmac_aescbc_dec
+___
+}
+#########################################
+{      my  %opcode = (
+       "sha512h"       => 0xce608000,  "sha512h2"      => 0xce608400,
+       "sha512su0"     => 0xcec08000,  "sha512su1"     => 0xce608800   );
+
+       sub unsha512 {
+       my ($mnemonic,$arg)=@_;
+
+       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                       $mnemonic,$arg;
+       }
+}
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/\/\// and !/^$/);
+       print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+       s/\`([^\`]*)\`/eval($1)/ge;
+       s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge;
+       print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
\ No newline at end of file
index 661b34592f295aecff9b6524ced83e050ac1578e..ed79316b0068b52731eeb8603069765a64903881 100644 (file)
@@ -33,7 +33,7 @@ IF[{- !$disabled{asm} -}]
   $AESDEF_armv4=AES_ASM BSAES_ASM
   $AESASM_aarch64=\
         aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S \
-        aes-sha1-armv8.S aes-sha256-armv8.S
+        aes-sha1-armv8.S aes-sha256-armv8.S aes-sha512-armv8.S
   $AESDEF_aarch64=BSAES_ASM VPAES_ASM
 
   $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s
@@ -143,6 +143,8 @@ GENERATE[aes-sha1-armv8.S]=asm/aes-sha1-armv8.pl
 INCLUDE[aes-sha1-armv8.o]=..
 GENERATE[aes-sha256-armv8.S]=asm/aes-sha256-armv8.pl
 INCLUDE[aes-sha256-armv8.o]=..
+GENERATE[aes-sha512-armv8.S]=asm/aes-sha512-armv8.pl
+INCLUDE[aes-sha512-armv8.o]=..
 
 GENERATE[aes-armv4.S]=asm/aes-armv4.pl
 INCLUDE[aes-armv4.o]=..
index 4bd45d3558b1cb0f76a8092ccd275452c7cb6e65..dcd0f3f41c4543d0423df728fb8c73bee32f5620 100644 (file)
@@ -1350,7 +1350,7 @@ static const unsigned char so[9517] = {
     0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x03,0x2E,  /* [ 9507] OBJ_SLH_DSA_SHAKE_256f_WITH_SHAKE256 */
 };
 
-#define NUM_NID 1493
+#define NUM_NID 1496
 static const ASN1_OBJECT nid_objs[NUM_NID] = {
     {"UNDEF", "undefined", NID_undef},
     {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]},
@@ -2845,9 +2845,12 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = {
     {"AES-128-CBC-HMAC-SHA256-ETM", "aes-128-cbc-hmac-sha256-etm", NID_aes_128_cbc_hmac_sha256_etm},
     {"AES-192-CBC-HMAC-SHA256-ETM", "aes-192-cbc-hmac-sha256-etm", NID_aes_192_cbc_hmac_sha256_etm},
     {"AES-256-CBC-HMAC-SHA256-ETM", "aes-256-cbc-hmac-sha256-etm", NID_aes_256_cbc_hmac_sha256_etm},
+    {"AES-128-CBC-HMAC-SHA512-ETM", "aes-128-cbc-hmac-sha512-etm", NID_aes_128_cbc_hmac_sha512_etm},
+    {"AES-192-CBC-HMAC-SHA512-ETM", "aes-192-cbc-hmac-sha512-etm", NID_aes_192_cbc_hmac_sha512_etm},
+    {"AES-256-CBC-HMAC-SHA512-ETM", "aes-256-cbc-hmac-sha512-etm", NID_aes_256_cbc_hmac_sha512_etm},
 };
 
-#define NUM_SN 1484
+#define NUM_SN 1487
 static const unsigned int sn_objs[NUM_SN] = {
      364,    /* "AD_DVCS" */
      419,    /* "AES-128-CBC" */
@@ -2855,6 +2858,7 @@ static const unsigned int sn_objs[NUM_SN] = {
     1487,    /* "AES-128-CBC-HMAC-SHA1-ETM" */
      948,    /* "AES-128-CBC-HMAC-SHA256" */
     1490,    /* "AES-128-CBC-HMAC-SHA256-ETM" */
+    1493,    /* "AES-128-CBC-HMAC-SHA512-ETM" */
      421,    /* "AES-128-CFB" */
      650,    /* "AES-128-CFB1" */
      653,    /* "AES-128-CFB8" */
@@ -2869,6 +2873,7 @@ static const unsigned int sn_objs[NUM_SN] = {
     1488,    /* "AES-192-CBC-HMAC-SHA1-ETM" */
      949,    /* "AES-192-CBC-HMAC-SHA256" */
     1491,    /* "AES-192-CBC-HMAC-SHA256-ETM" */
+    1494,    /* "AES-192-CBC-HMAC-SHA512-ETM" */
      425,    /* "AES-192-CFB" */
      651,    /* "AES-192-CFB1" */
      654,    /* "AES-192-CFB8" */
@@ -2882,6 +2887,7 @@ static const unsigned int sn_objs[NUM_SN] = {
     1489,    /* "AES-256-CBC-HMAC-SHA1-ETM" */
      950,    /* "AES-256-CBC-HMAC-SHA256" */
     1492,    /* "AES-256-CBC-HMAC-SHA256-ETM" */
+    1495,    /* "AES-256-CBC-HMAC-SHA512-ETM" */
      429,    /* "AES-256-CFB" */
      652,    /* "AES-256-CFB1" */
      655,    /* "AES-256-CFB8" */
@@ -4335,7 +4341,7 @@ static const unsigned int sn_objs[NUM_SN] = {
     1289,    /* "zstd" */
 };
 
-#define NUM_LN 1484
+#define NUM_LN 1487
 static const unsigned int ln_objs[NUM_LN] = {
      363,    /* "AD Time Stamping" */
      405,    /* "ANSI X9.62" */
@@ -4773,6 +4779,7 @@ static const unsigned int ln_objs[NUM_LN] = {
     1487,    /* "aes-128-cbc-hmac-sha1-etm" */
      948,    /* "aes-128-cbc-hmac-sha256" */
     1490,    /* "aes-128-cbc-hmac-sha256-etm" */
+    1493,    /* "aes-128-cbc-hmac-sha512-etm" */
      896,    /* "aes-128-ccm" */
      421,    /* "aes-128-cfb" */
      650,    /* "aes-128-cfb1" */
@@ -4789,6 +4796,7 @@ static const unsigned int ln_objs[NUM_LN] = {
     1488,    /* "aes-192-cbc-hmac-sha1-etm" */
      949,    /* "aes-192-cbc-hmac-sha256" */
     1491,    /* "aes-192-cbc-hmac-sha256-etm" */
+    1494,    /* "aes-192-cbc-hmac-sha512-etm" */
      899,    /* "aes-192-ccm" */
      425,    /* "aes-192-cfb" */
      651,    /* "aes-192-cfb1" */
@@ -4804,6 +4812,7 @@ static const unsigned int ln_objs[NUM_LN] = {
     1489,    /* "aes-256-cbc-hmac-sha1-etm" */
      950,    /* "aes-256-cbc-hmac-sha256" */
     1492,    /* "aes-256-cbc-hmac-sha256-etm" */
+    1495,    /* "aes-256-cbc-hmac-sha512-etm" */
      902,    /* "aes-256-ccm" */
      429,    /* "aes-256-cfb" */
      652,    /* "aes-256-cfb1" */
index b43639311120a3c2c3b850799a65724ddede90c0..15bd8909e8eebb4d4c1fe2c578e028400741555b 100644 (file)
@@ -1490,3 +1490,6 @@ aes_256_cbc_hmac_sha1_etm         1489
 aes_128_cbc_hmac_sha256_etm            1490
 aes_192_cbc_hmac_sha256_etm            1491
 aes_256_cbc_hmac_sha256_etm            1492
+aes_128_cbc_hmac_sha512_etm            1493
+aes_192_cbc_hmac_sha512_etm            1494
+aes_256_cbc_hmac_sha512_etm            1495
index 9c61c4a642a5fbf8c48204fa3144ed681cc5e7e1..6afeefff60876ac662cca3b8c6f9ded4d64004ce 100644 (file)
@@ -1727,6 +1727,9 @@ sm-scheme 104 10        : SM4-XTS             : sm4-xts
                        : AES-128-CBC-HMAC-SHA256-ETM   : aes-128-cbc-hmac-sha256-etm
                        : AES-192-CBC-HMAC-SHA256-ETM   : aes-192-cbc-hmac-sha256-etm
                        : AES-256-CBC-HMAC-SHA256-ETM   : aes-256-cbc-hmac-sha256-etm
+                       : AES-128-CBC-HMAC-SHA512-ETM   : aes-128-cbc-hmac-sha512-etm
+                       : AES-192-CBC-HMAC-SHA512-ETM   : aes-192-cbc-hmac-sha512-etm
+                       : AES-256-CBC-HMAC-SHA512-ETM   : aes-256-cbc-hmac-sha512-etm
 
 ISO-US 10046 2 1       : dhpublicnumber                : X9.42 DH
 
index 34aa74ecb2562f0d0719b0b9786ca255e0cf6b39..bdd51976a7a8a6f6a0a6f7409fa12f547f79159d 100644 (file)
@@ -94,7 +94,7 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
 
 #  if (defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(_M_ARM64))
 #   include "crypto/arm_arch.h"
-#   if __ARM_MAX_ARCH__>=7
+#   if __ARM_MAX_ARCH__ >= 7
 #    if defined(BSAES_ASM)
 #     define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
 #    endif
@@ -116,6 +116,8 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
                                               (OPENSSL_armcap_P & ARMV8_SHA1))
 #     define HWAES_CBC_HMAC_SHA256_ETM_CAPABLE (HWAES_CAPABLE && \
                                                 (OPENSSL_armcap_P & ARMV8_SHA256))
+#     define HWAES_CBC_HMAC_SHA512_ETM_CAPABLE (HWAES_CAPABLE && \
+                                                (OPENSSL_armcap_P & ARMV8_SHA512))
 #     ifndef __AARCH64EB__
 #      define AES_CBC_HMAC_SHA_ETM_CAPABLE 1
 #     endif
index 0f3d79a889f197a0b64e9e141f3f4288017dff3b..8326216f67f233a63a7a537b4c9888aea6f7ad51 100644 (file)
 #define LN_aes_256_cbc_hmac_sha256_etm          "aes-256-cbc-hmac-sha256-etm"
 #define NID_aes_256_cbc_hmac_sha256_etm         1492
 
+#define SN_aes_128_cbc_hmac_sha512_etm          "AES-128-CBC-HMAC-SHA512-ETM"
+#define LN_aes_128_cbc_hmac_sha512_etm          "aes-128-cbc-hmac-sha512-etm"
+#define NID_aes_128_cbc_hmac_sha512_etm         1493
+
+#define SN_aes_192_cbc_hmac_sha512_etm          "AES-192-CBC-HMAC-SHA512-ETM"
+#define LN_aes_192_cbc_hmac_sha512_etm          "aes-192-cbc-hmac-sha512-etm"
+#define NID_aes_192_cbc_hmac_sha512_etm         1494
+
+#define SN_aes_256_cbc_hmac_sha512_etm          "AES-256-CBC-HMAC-SHA512-ETM"
+#define LN_aes_256_cbc_hmac_sha512_etm          "aes-256-cbc-hmac-sha512-etm"
+#define NID_aes_256_cbc_hmac_sha512_etm         1495
+
 #define SN_dhpublicnumber               "dhpublicnumber"
 #define LN_dhpublicnumber               "X9.42 DH"
 #define NID_dhpublicnumber              920
index 7621b4b1d62703d87ed42ead556ea3b70e61e8e4..f6503572c2a58e8b5d6542c7af6829949c171a53 100644 (file)
@@ -16,6 +16,7 @@ int ossl_cipher_capable_aes_cbc_hmac_sha1(void);
 int ossl_cipher_capable_aes_cbc_hmac_sha256(void);
 int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void);
 int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void);
+int ossl_cipher_capable_aes_cbc_hmac_sha512_etm(void);
 
 OSSL_FUNC_provider_get_capabilities_fn ossl_prov_get_capabilities;
 
index 6e33f6ee66b4e39fb08b68d325fc3aaab5149cd4..7e3c354de0f17eae3d8607c631b732c24c854db1 100644 (file)
@@ -226,7 +226,7 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA1, ossl_aes256cbc_hmac_sha1_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha1),
     ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA256, ossl_aes128cbc_hmac_sha256_functions,
-        ossl_cipher_capable_aes_cbc_hmac_sha256),
+         ossl_cipher_capable_aes_cbc_hmac_sha256),
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256, ossl_aes256cbc_hmac_sha256_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha256),
     ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM, ossl_aes128cbc_hmac_sha1_etm_functions,
@@ -236,11 +236,17 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM, ossl_aes256cbc_hmac_sha1_etm_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
     ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM, ossl_aes128cbc_hmac_sha256_etm_functions,
-        ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+         ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
     ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM, ossl_aes192cbc_hmac_sha256_etm_functions,
-        ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+         ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM, ossl_aes256cbc_hmac_sha256_etm_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+    ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA512_ETM, ossl_aes128cbc_hmac_sha512_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha512_etm),
+    ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA512_ETM, ossl_aes192cbc_hmac_sha512_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha512_etm),
+    ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA512_ETM, ossl_aes256cbc_hmac_sha512_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha512_etm),
 #ifndef OPENSSL_NO_ARIA
     ALG(PROV_NAMES_ARIA_256_GCM, ossl_aria256gcm_functions),
     ALG(PROV_NAMES_ARIA_192_GCM, ossl_aria192gcm_functions),
index 03258fc97f5ca9a8e232c4aea2851102b8067aef..21032b9ba2e0fcefd643d849cd34e07cbfc3ad45 100644 (file)
@@ -366,6 +366,12 @@ static const OSSL_ALGORITHM_CAPABLE fips_ciphers[] = {
          ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM, ossl_aes256cbc_hmac_sha256_etm_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+    ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA512_ETM, ossl_aes128cbc_hmac_sha512_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha512_etm),
+    ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA512_ETM, ossl_aes192cbc_hmac_sha512_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha512_etm),
+    ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA512_ETM, ossl_aes256cbc_hmac_sha512_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha512_etm),
 #ifndef OPENSSL_NO_DES
     ALG(PROV_NAMES_DES_EDE3_ECB, ossl_tdes_ede3_ecb_functions),
     ALG(PROV_NAMES_DES_EDE3_CBC, ossl_tdes_ede3_cbc_functions),
index 47c140ace110c3b2a916f90915c9922fa2b8faba..e9dbfecfacff62a4cfb6eeb0395324bcbe6d198d 100644 (file)
@@ -108,6 +108,7 @@ SOURCE[$AES_GOAL]=\
         cipher_aes_cbc_hmac_sha_etm.c \
         cipher_aes_cbc_hmac_sha1_etm_hw.c \
         cipher_aes_cbc_hmac_sha256_etm_hw.c \
+        cipher_aes_cbc_hmac_sha512_etm_hw.c \
         cipher_cts.c
 DEFINE[$AES_GOAL]=$AESXTSDEF
 
index 5d164ff5d7123dd7ad8da754446a37b23017fbc1..7d5461ca1c365f6d4a25d6cbc169f6f523dbe256 100644 (file)
@@ -74,7 +74,9 @@ static int hwaes_cbc_hmac_sha1_etm(PROV_CIPHER_CTX *vctx,
 {
     PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
     CIPH_DIGEST arg = {0};
+
     ciph_digest_arg_init(&arg, vctx);
+
     if (len % AES_BLOCK_SIZE) {
         ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH);
         return 0;
@@ -166,8 +168,8 @@ static int aes_cbc_hmac_sha1_cipher(PROV_CIPHER_CTX *vctx,
 
 static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha1_etm = {
     {
-      aes_cbc_hmac_sha1_init_key,
-      aes_cbc_hmac_sha1_cipher
+        aes_cbc_hmac_sha1_init_key,
+        aes_cbc_hmac_sha1_cipher
     },
     aes_cbc_hmac_sha1_set_mac_key
 };
index 8a5474fc6554da31b383a4e03ffebc8bb620d9b7..95116ae389459fa1f9aaf7726f354b6f032f19ef 100644 (file)
@@ -24,11 +24,11 @@ void sha256_block_data_order(void *c, const void *p, size_t len);
 
 # if defined(__aarch64__)
 int asm_aescbc_sha256_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
-                         uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
-                         CIPH_DIGEST *arg);
+                           uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+                           CIPH_DIGEST *arg);
 void asm_sha256_hmac_aescbc_dec(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
-                              const unsigned char *dsrc, uint8_t *ddst, size_t dlen,
-                              CIPH_DIGEST *arg);
+                                const unsigned char *dsrc, uint8_t *ddst, size_t dlen,
+                                CIPH_DIGEST *arg);
 #  define HWAES128_ENC_CBC_SHA256_ETM asm_aescbc_sha256_hmac
 #  define HWAES128_DEC_CBC_SHA256_ETM asm_sha256_hmac_aescbc_dec
 # endif
@@ -74,7 +74,9 @@ static int hwaes_cbc_hmac_sha256_etm(PROV_CIPHER_CTX *vctx,
 {
     PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
     CIPH_DIGEST arg = {0};
+
     ciph_digest_arg_init(&arg, vctx);
+
     if (len % AES_BLOCK_SIZE) {
         ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH);
         return 0;
@@ -166,8 +168,8 @@ static int aes_cbc_hmac_sha256_cipher(PROV_CIPHER_CTX *vctx,
 
 static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha256_etm = {
     {
-      aes_cbc_hmac_sha256_init_key,
-      aes_cbc_hmac_sha256_cipher
+        aes_cbc_hmac_sha256_init_key,
+        aes_cbc_hmac_sha256_cipher
     },
     aes_cbc_hmac_sha256_set_mac_key
 };
diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha512_etm_hw.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha512_etm_hw.c
new file mode 100644 (file)
index 0000000..5d2a270
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * All low level APIs are deprecated for public use, but still ok for internal
+ * use where we're using them to implement the higher level EVP interface, as is
+ * the case here.
+ */
+#include "internal/deprecated.h"
+#include "cipher_aes_cbc_hmac_sha_etm.h"
+
+#if !defined(AES_CBC_HMAC_SHA_ETM_CAPABLE)
+int ossl_cipher_capable_aes_cbc_hmac_sha512_etm(void)
+{
+    return 0;
+}
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha512_etm(void)
+{
+    return NULL;
+}
+#else
+# if defined(__aarch64__)
+void asm_aescbc_sha512_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+                            uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+                            CIPH_DIGEST *arg);
+void asm_sha512_hmac_aescbc_dec(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+                                uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+                                CIPH_DIGEST *arg);
+#  define HWAES_ENC_CBC_SHA512_ETM asm_aescbc_sha512_hmac
+#  define HWAES_DEC_CBC_SHA512_ETM asm_sha512_hmac_aescbc_dec
+# endif
+
+int ossl_cipher_capable_aes_cbc_hmac_sha512_etm(void)
+{
+    return HWAES_CBC_HMAC_SHA512_ETM_CAPABLE;
+}
+
+static int hwaes_cbc_hmac_sha512_init_key(PROV_CIPHER_CTX *vctx,
+                                          const unsigned char *key,
+                                          size_t keylen)
+{
+    int ret;
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_AES_HMAC_SHA512_ETM_CTX *sctx = (PROV_AES_HMAC_SHA512_ETM_CTX *)vctx;
+
+    if (ctx->base.enc)
+        ret = aes_v8_set_encrypt_key(key, ctx->base.keylen * 8, &ctx->ks);
+    else
+        ret = aes_v8_set_decrypt_key(key, ctx->base.keylen * 8, &ctx->ks);
+
+    SHA512_Init(&sctx->head);    /* handy when benchmarking */
+    sctx->tail = sctx->head;
+
+    return ret < 0 ? 0 : 1;
+}
+
+void sha512_block_data_order(void *c, const void *p, size_t len);
+
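+/*
+ * Minimal SHA-512 update: drain any bytes already buffered in the context,
+ * hash whole 128-byte blocks directly with sha512_block_data_order() while
+ * maintaining the 128-bit length counter, and leave any tail for
+ * SHA512_Update().
+ */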
+static void sha512_update(SHA512_CTX *c, const void *data, size_t len)
+{
+    const unsigned char *ptr = data;
+    size_t res;
+
+    if ((res = c->num)) {
+        res = SHA512_CBLOCK - res;
+        if (len < res)
+            res = len;
+        SHA512_Update(c, ptr, res);
+        ptr += res;
+        len -= res;
+    }
+
+    res = len % SHA512_CBLOCK;
+    len -= res;
+
+    if (len) {
+        sha512_block_data_order(c, ptr, len / SHA512_CBLOCK);
+
+        ptr += len;
+        c->Nh += len >> 61;
+        c->Nl += len <<= 3;
+        if (c->Nl < (unsigned int)len)
+            c->Nh++;
+    }
+
+    if (res)
+        SHA512_Update(c, ptr, res);
+}
+
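+/*
+ * Package the AES key schedule, IV and precomputed HMAC inner/outer key pads
+ * into the CIPH_DIGEST argument block consumed by the stitched assembly.
+ */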
+static void ciph_digest_arg_init(CIPH_DIGEST *arg, PROV_CIPHER_CTX *vctx)
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_AES_HMAC_SHA512_ETM_CTX *sctx = (PROV_AES_HMAC_SHA512_ETM_CTX *)vctx;
+
+    arg->cipher.key = (uint8_t *)&(ctx->ks);
+    arg->cipher.key_rounds = ctx->ks.rounds;
+    arg->cipher.iv = (uint8_t *)&(ctx->base.iv);
+    arg->digest.hmac.i_key_pad = (uint8_t *)&(sctx->head);
+    arg->digest.hmac.o_key_pad = (uint8_t *)&(sctx->tail);
+}
+
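+/*
+ * Stitched AES-CBC + HMAC-SHA512 encrypt-then-MAC.  Only whole AES blocks are
+ * accepted (non-padding mode); on decryption the computed tag is compared
+ * against the expected tag in constant time.
+ */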
+static int hwaes_cbc_hmac_sha512_etm(PROV_CIPHER_CTX *vctx,
+                                     unsigned char *out,
+                                     const unsigned char *in, size_t len)
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    CIPH_DIGEST arg = {0};
+
+    ciph_digest_arg_init(&arg, vctx);
+
+    if (len % AES_BLOCK_SIZE) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH);
+        return 0;
+    }
+
+    if (ctx->base.enc) {
+        HWAES_ENC_CBC_SHA512_ETM(in, out, len, out, ctx->tag, len, &arg);
+        return 1;
+    } else {
+        if (ctx->taglen == 0) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_TAG_NOT_SET);
+            return 0;
+        }
+        HWAES_DEC_CBC_SHA512_ETM(in, out, len, out, ctx->tag, len, &arg);
+        if (CRYPTO_memcmp(ctx->exp_tag, ctx->tag, ctx->taglen)) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG);
+            return 0;
+        }
+        return 1;
+    }
+}
+
+static int hwaes_cbc_hmac_sha512_cipher(PROV_CIPHER_CTX *vctx,
+                                        unsigned char *out,
+                                        const unsigned char *in, size_t len)
+{
+    return hwaes_cbc_hmac_sha512_etm(vctx, out, in, len);
+}
+
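+/*
+ * Precompute the HMAC inner (ipad) and outer (opad) hash states from the MAC
+ * key, so the stitched code only has to hash the record data.
+ */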
+static void hwaes_cbc_hmac_sha512_set_mac_key(void *vctx,
+                                              const unsigned char *mackey,
+                                              size_t len)
+{
+    PROV_AES_HMAC_SHA512_ETM_CTX *ctx = (PROV_AES_HMAC_SHA512_ETM_CTX *)vctx;
+    unsigned int i;
+    unsigned char hmac_key[128];
+
+    memset(hmac_key, 0, sizeof(hmac_key));
+
+    if (len > sizeof(hmac_key)) {
+        SHA512_Init(&ctx->head);
+        sha512_update(&ctx->head, mackey, len);
+        SHA512_Final(hmac_key, &ctx->head);
+    } else {
+        memcpy(hmac_key, mackey, len);
+    }
+
+    for (i = 0; i < sizeof(hmac_key); i++)
+        hmac_key[i] ^= 0x36; /* ipad */
+    SHA512_Init(&ctx->head);
+    sha512_update(&ctx->head, hmac_key, sizeof(hmac_key));
+
+    for (i = 0; i < sizeof(hmac_key); i++)
+        hmac_key[i] ^= 0x36 ^ 0x5c; /* opad */
+    SHA512_Init(&ctx->tail);
+    sha512_update(&ctx->tail, hmac_key, sizeof(hmac_key));
+
+    OPENSSL_cleanse(hmac_key, sizeof(hmac_key));
+}
+
+static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha512_etm = {
+    {
+     hwaes_cbc_hmac_sha512_init_key,
+     hwaes_cbc_hmac_sha512_cipher
+    },
+    hwaes_cbc_hmac_sha512_set_mac_key
+};
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha512_etm(void)
+{
+    return &cipher_hw_aes_hmac_sha512_etm;
+}
+
+#endif /* !defined(AES_CBC_HMAC_SHA_ETM_CAPABLE) */
index 0292511353d49caeb6134184be2c4edd8d962a51..180cc1078394cae0df85fc1d0430074f810922f8 100644 (file)
@@ -16,7 +16,7 @@
 #ifndef AES_CBC_HMAC_SHA_ETM_CAPABLE
 # define IMPLEMENT_CIPHER(nm, sub, kbits, blkbits, ivbits, flags)              \
 const OSSL_DISPATCH ossl_##nm##kbits##sub##_functions[] = {                    \
-    OSSL_DISPATCH_END                                                              \
+    OSSL_DISPATCH_END                                                          \
 };
 #else
 static OSSL_FUNC_cipher_encrypt_init_fn aes_einit;
@@ -32,7 +32,7 @@ static int aes_set_ctx_params(void *vctx, const OSSL_PARAM params[])
 {
     PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
     PROV_CIPHER_HW_AES_HMAC_SHA_ETM *hw =
-       (PROV_CIPHER_HW_AES_HMAC_SHA_ETM *)ctx->hw;
+        (PROV_CIPHER_HW_AES_HMAC_SHA_ETM *)ctx->hw;
     const OSSL_PARAM *p;
 
     if (params == NULL)
@@ -82,8 +82,8 @@ static int aes_set_ctx_params(void *vctx, const OSSL_PARAM params[])
 }
 
 static int aes_einit(void *ctx, const unsigned char *key, size_t keylen,
-                          const unsigned char *iv, size_t ivlen,
-                          const OSSL_PARAM params[])
+                     const unsigned char *iv, size_t ivlen,
+                     const OSSL_PARAM params[])
 {
     if (!ossl_cipher_generic_einit(ctx, key, keylen, iv, ivlen, NULL))
         return 0;
@@ -91,8 +91,8 @@ static int aes_einit(void *ctx, const unsigned char *key, size_t keylen,
 }
 
 static int aes_dinit(void *ctx, const unsigned char *key, size_t keylen,
-                          const unsigned char *iv, size_t ivlen,
-                          const OSSL_PARAM params[])
+                     const unsigned char *iv, size_t ivlen,
+                     const OSSL_PARAM params[])
 {
     if (!ossl_cipher_generic_dinit(ctx, key, keylen, iv, ivlen, NULL))
         return 0;
@@ -259,6 +259,43 @@ static void *aes_cbc_hmac_sha256_etm_dupctx(void *provctx)
     return OPENSSL_memdup(ctx, sizeof(*ctx));
 }
 
+static void *aes_cbc_hmac_sha512_etm_newctx(void *provctx, size_t kbits,
+                                            size_t blkbits, size_t ivbits,
+                                            uint64_t flags)
+{
+    PROV_AES_HMAC_SHA512_ETM_CTX *ctx;
+
+    if (!ossl_prov_is_running())
+        return NULL;
+
+    ctx = OPENSSL_zalloc(sizeof(*ctx));
+    if (ctx != NULL)
+        base_ctx_init(provctx, &ctx->base_ctx,
+                      ossl_prov_cipher_hw_aes_cbc_hmac_sha512_etm(), kbits, blkbits,
+                      ivbits, flags);
+    return ctx;
+}
+
+static void aes_cbc_hmac_sha512_etm_freectx(void *vctx)
+{
+    PROV_AES_HMAC_SHA512_ETM_CTX *ctx = (PROV_AES_HMAC_SHA512_ETM_CTX *)vctx;
+
+    if (ctx != NULL) {
+        ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx);
+        OPENSSL_clear_free(ctx, sizeof(*ctx));
+    }
+}
+
+static void *aes_cbc_hmac_sha512_etm_dupctx(void *provctx)
+{
+    PROV_AES_HMAC_SHA512_ETM_CTX *ctx = provctx;
+
+    if (ctx == NULL)
+        return NULL;
+
+    return OPENSSL_memdup(ctx, sizeof(*ctx));
+}
+
 # define IMPLEMENT_CIPHER(nm, sub, kbits, blkbits, ivbits, flags)              \
 static OSSL_FUNC_cipher_newctx_fn nm##_##kbits##_##sub##_newctx;               \
 static void *nm##_##kbits##_##sub##_newctx(void *provctx)                      \
@@ -307,4 +344,10 @@ IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 128, 128, 128, EVP_CIPH_FLAG_ENC_THEN
 /* ossl_aes192cbc_hmac_sha256_etm_functions */
 IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 192, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
 /* ossl_aes256cbc_hmac_sha256_etm_functions */
-IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
\ No newline at end of file
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes128cbc_hmac_sha512_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha512_etm, 128, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes192cbc_hmac_sha512_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha512_etm, 192, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes256cbc_hmac_sha512_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha512_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
index c8b2b1e5ff9e03584eded99002eacb54e8c954ed..460ccbc73667faefe6f9e418d129f2fee46a644b 100644 (file)
@@ -13,6 +13,7 @@
 
 int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void);
 int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void);
+int ossl_cipher_capable_aes_cbc_hmac_sha512_etm(void);
 
 typedef struct prov_cipher_hw_aes_hmac_sha_ctx_etm_st {
     PROV_CIPHER_HW base; /* must be first */
@@ -21,12 +22,13 @@ typedef struct prov_cipher_hw_aes_hmac_sha_ctx_etm_st {
 
 const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void);
 const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void);
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha512_etm(void);
 
 #ifdef AES_CBC_HMAC_SHA_ETM_CAPABLE
 # include <openssl/aes.h>
 # include <openssl/sha.h>
 
-# define AES_CBC_MAX_HMAC_SIZE 32
+# define AES_CBC_MAX_HMAC_SIZE 64
 
 typedef struct prov_aes_hmac_sha_etm_ctx_st {
     PROV_CIPHER_CTX base;
@@ -47,6 +49,11 @@ typedef struct prov_aes_hmac_sha256_etm_ctx_st {
     SHA256_CTX head, tail;
 } PROV_AES_HMAC_SHA256_ETM_CTX;
 
+typedef struct prov_aes_hmac_sha512_etm_ctx_st {
+    PROV_AES_HMAC_SHA_ETM_CTX base_ctx;
+    SHA512_CTX head, tail, md;
+} PROV_AES_HMAC_SHA512_ETM_CTX;
+
 typedef struct {
     struct {
         uint8_t *key;
index 2b770badc691d25e78d83b36188a4073ec445ee5..311e5f3869335fa96fbce3af91e91ef900632a34 100644 (file)
@@ -104,6 +104,9 @@ extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha1_etm_functions[];
 extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha256_etm_functions[];
 extern const OSSL_DISPATCH ossl_aes192cbc_hmac_sha256_etm_functions[];
 extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha256_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha512_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes192cbc_hmac_sha512_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha512_etm_functions[];
 
 #ifndef OPENSSL_NO_ARIA
 extern const OSSL_DISPATCH ossl_aria256gcm_functions[];
index 19fdf635c07ad2496aa11ab69947df58f9a4abd1..34e1147b2b76a487d2088d9cd5e3b6730f7b9b02 100644 (file)
 #define PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM "AES-128-CBC-HMAC-SHA256-ETM"
 #define PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM "AES-192-CBC-HMAC-SHA256-ETM"
 #define PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM "AES-256-CBC-HMAC-SHA256-ETM"
+#define PROV_NAMES_AES_128_CBC_HMAC_SHA512_ETM "AES-128-CBC-HMAC-SHA512-ETM"
+#define PROV_NAMES_AES_192_CBC_HMAC_SHA512_ETM "AES-192-CBC-HMAC-SHA512-ETM"
+#define PROV_NAMES_AES_256_CBC_HMAC_SHA512_ETM "AES-256-CBC-HMAC-SHA512-ETM"
 
 /*-
  * Digests
index ae03e589a03e4401a45f9698263d37cca88a9dcf..706460170b98302a5581b2c5eb736de65e21d236 100644 (file)
@@ -488,7 +488,7 @@ static int test_cipher_reinit_partialupdate(int test_id)
     /* skip any ciphers that don't allow partial updates */
     if (((EVP_CIPHER_get_flags(cipher)
           & (EVP_CIPH_FLAG_CTS | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK |
-          EVP_CIPH_FLAG_ENC_THEN_MAC)) != 0)
+             EVP_CIPH_FLAG_ENC_THEN_MAC)) != 0)
         || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_CCM_MODE
         || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_XTS_MODE
         || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_WRAP_MODE) {
index f45b7d816621d755d1ac14c6e633ce3da902aedc..db5271385e5f63cb3da1a6a7bd7f4b58a4817cb7 100644 (file)
@@ -1346,13 +1346,13 @@ static int cipher_test_enc(EVP_TEST *t, int enc, size_t out_misalign,
     } else if (!enc && (expected->aead == EVP_CIPH_OCB_MODE
                         || expected->tag_late)) {
         if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG,
-                                 expected->tag_len, expected->tag) <= 0) {
+                                expected->tag_len, expected->tag) <= 0) {
             t->err = "TAG_SET_ERROR";
             goto err;
         }
     } else if (!enc && expected->mac_key && expected->tag) {
-       if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG,
-                                 expected->tag_len, expected->tag) <= 0) {
+        if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG,
+                                expected->tag_len, expected->tag) <= 0) {
             t->err = "TAG_SET_ERROR";
             goto err;
         }
@@ -1453,25 +1453,32 @@ static int cipher_test_enc(EVP_TEST *t, int enc, size_t out_misalign,
     if (enc && expected->tag) {
         if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA1-ETM")
             || EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA512-ETM")
             || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA1-ETM")
             || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA512-ETM")
             || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM")
-            || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")) {
-            unsigned char rtag[32] = {0};
+            || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA512-ETM")) {
+            unsigned char rtag[64] = {0};
             unsigned tag_len = 0;
             OSSL_PARAM params[2];
 
             if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA1-ETM")
                 || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA1-ETM")
-                || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM")) {
+                || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM"))
                 tag_len = 20;
-            else if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM")
-                       || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM")
-                       || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")) {
+            else if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM")
+                     || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM")
+                     || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM"))
                 tag_len = 32;
-            }
+            else if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA512-ETM")
+                     || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA512-ETM")
+                     || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA512-ETM"))
+                tag_len = 64;
 
-            if (!TEST_size_t_le(expected->tag_len, tag_len)) {
+            if (!TEST_size_t_le(expected->tag_len, tag_len) ||
+                !TEST_size_t_le(tag_len, sizeof(rtag))) {
                 t->err = "TAG_LENGTH_INTERNAL_ERROR";
                 goto err;
             }
index 06da481bb5b80e8453fa643df682bc67370ddbfe..050ce9b419c28b94c27d7600706a02b896854b62 100644 (file)
@@ -229,3 +229,56 @@ Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021
 Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a4ca75016ae77ac335ba758396232a87ffceacf24a0e287371eaa04570cb68dcd61882e1c3f7aca38afed34138fedefe167bb9c741ebd14da2eba3cf5b9aa06bb93ca61fa462de7e1f439efac5ea55edab61171250be36da513e6b5f92c8267f778cdde5720128a586c7bbd5864686b12710daa9f133706e81fa3a066bd1f29277c08ca8f052b3ed06f04ec2a8509f54934fd9b06f4115e546011ff485ac76d5fce0329c94bf5f29726bed49ace94abf53b036c1f920f8c71d44deca7b11f653025698425717bb3cc8f5e74230d8ede675ee0eae6f8aae274152c7503c567427a71323feb84b0fc0515030c933e4c7399be13322b5d4ccabb97c011d75de82f38a540e972bc2a515dc31d50e78b74be891cc4a2ddbe4b50d0d27c069985a581b80a9f591a4bb198f085af2138ca9b4f595c37d60f15d960b1e39de7ff92a699d9aca4a44ff9d327c7130e6b0ce90032e358f3743d8abccaeb0426226d6ec233fdf289bdde5f3b2756a587a382e3353d77acb9774bd64978629633f2122d1fa376b12cfbe4781d6a35227d71fdfa929c1435596fbaf7fe0aea4fa02c6b9e8099c62149ed82819a2088b72660be8ea364c13d5340be93cab8ac92914d2b1115cbb7
 Tag = 8cb8898a5b559984da3cbaa4703c9ed3cfc2f56c7292a3279a3dd5f7475412e1
 
+Title = AES-128-CBC-HMAC-SHA512-ETM test vectors
+
+Cipher = AES-128-CBC-HMAC-SHA512-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be
+Tag = 75c1883b2a1b71b98d04a0fc46b91b7e5d6e12c23a8e19a914d88be9a1d8a9f77022bff6144dfba69764565606856bf0f2510fef52bc4aa3a5b9089975a0400a
+
+Cipher = AES-128-CBC-HMAC-SHA512-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c
+Tag = 60ddf92cf1ed62ca3213cda9a497fbbd1f54c3a10177a9ccc3c8282dc58800edf5f710b08413fe1eb422a4efc77b97ef1a87da44bb7e8547c5364200f9ee48fc
+
+Cipher = AES-128-CBC-HMAC-SHA512-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c3bfadb50a6126c4fe31d52606b97f347a9d6722a458cc2afdd895c3a247d11e551398180bc445b0ea94d17a1a441fb10b86d84a7549e03b6edf1a12591c63dfa167f2f11ea12b2d3d8f62d92be9238d1e6eed2099f3d0f9e1fe541618bbda588899002c3078202a2d138942c4325b673e494b310a502cda70e8f62480776c31068cb3d2f4c250b9e65669d950b1a4d50cf5f2b11c74960347885e8dbb89d58f24871c34f1a134b1873222b24a310f8bb3299ca1d16cb1921c97fb462e3150b57909ec7d376e93e52ea9e51094f22f11273c32403c82acebf575b7b7af7c98976adf6f4bd4199bd9201fa7321aaad828bfcc3785776f959484ff013d8a66d579af036a6c0e82d94e6eb773f6124f18da5ca4cf5b70f72e9d852766af78269d36a03eb2e2cdda79f16c0f81be27b6593c3f4e9d19cb7018a7e4ca74756dd66ac1b45a4d741e0431d120a7f84dbbc4d7d478b54464050e62d8da0c856ccbc2dcd4dec4aa4d554ac4cce8fbeca8ba4efb55a25771f425a6e5bd74c35972c3da41eeee7fb36b5075e5ab3115f7424f0dab05a085185e923d9ad3e74dc16ff2ecfe03afdf34ba17babafc65aa87600c632ccdcbcc1b591d723eb37a8a3f869cce9fe41
+Tag = ba428ce5296789f9859e377c8c959c6a2b29c5be296a8b0e505b38712d344df1a8ab3da8ca46cc85767414feafd607dd8e3d6707946cc955bcd7707ae74dab89
+
+
+Title = AES-256-CBC-HMAC-SHA512-ETM test vectors
+
+Cipher = AES-256-CBC-HMAC-SHA512-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0
+Tag = f3e5750ecd6bf757d180ce8a920b86900773f7801014dc57d77e52501bcb657cdc70784ca83f7235b77b6fd0cbdfc374bdf6217a2ebdf426746dfb7fdc458ce9
+
+Cipher = AES-256-CBC-HMAC-SHA512-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a
+Tag = e6cf78e7e21042a2cf9d0b835c3b1dafc30f3414811b2990fedb5ee47d72c5a3e52daa33c8abafeeace77495e5fd514ab9acdc793ed8b0699fb122bfb45d7d39
+
+Cipher = AES-256-CBC-HMAC-SHA512-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a4ca75016ae77ac335ba758396232a87ffceacf24a0e287371eaa04570cb68dcd61882e1c3f7aca38afed34138fedefe167bb9c741ebd14da2eba3cf5b9aa06bb93ca61fa462de7e1f439efac5ea55edab61171250be36da513e6b5f92c8267f778cdde5720128a586c7bbd5864686b12710daa9f133706e81fa3a066bd1f29277c08ca8f052b3ed06f04ec2a8509f54934fd9b06f4115e546011ff485ac76d5fce0329c94bf5f29726bed49ace94abf53b036c1f920f8c71d44deca7b11f653025698425717bb3cc8f5e74230d8ede675ee0eae6f8aae274152c7503c567427a71323feb84b0fc0515030c933e4c7399be13322b5d4ccabb97c011d75de82f38a540e972bc2a515dc31d50e78b74be891cc4a2ddbe4b50d0d27c069985a581b80a9f591a4bb198f085af2138ca9b4f595c37d60f15d960b1e39de7ff92a699d9aca4a44ff9d327c7130e6b0ce90032e358f3743d8abccaeb0426226d6ec233fdf289bdde5f3b2756a587a382e3353d77acb9774bd64978629633f2122d1fa376b12cfbe4781d6a35227d71fdfa929c1435596fbaf7fe0aea4fa02c6b9e8099c62149ed82819a2088b72660be8ea364c13d5340be93cab8ac92914d2b1115cbb7
+Tag = 6ed97bd77ea4cf480dba39cdcc10601837ddd72c00bf7937855f94514d6ebad1be1cf786f815a95d2a19889f4e4442f0adf91a1621b6ab57727b65ce243ded1a
+
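For reference, a minimal sketch (not part of this patch) of driving the new cipher through
the EVP interface. It assumes the MAC key is supplied with EVP_CTRL_AEAD_SET_MAC_KEY and the
64-byte tag is read back with EVP_CTRL_AEAD_GET_TAG, and it requires the input length to be
a multiple of 16 bytes since only non-padding mode is supported:

#include <openssl/evp.h>

/* Hypothetical helper: encrypt ptlen bytes (a multiple of 16) with
 * AES-256-CBC and obtain the HMAC-SHA512 tag computed over the ciphertext. */
static int aes256_cbc_hmac_sha512_etm_encrypt(const unsigned char key[32],
                                              const unsigned char mackey[16],
                                              const unsigned char iv[16],
                                              const unsigned char *pt, size_t ptlen,
                                              unsigned char *ct, unsigned char tag[64])
{
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "AES-256-CBC-HMAC-SHA512-ETM", NULL);
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int outl = 0, tmplen = 0, ok = 0;

    if (cipher == NULL || ctx == NULL)
        goto end;
    if (!EVP_EncryptInit_ex2(ctx, cipher, key, iv, NULL))
        goto end;
    EVP_CIPHER_CTX_set_padding(ctx, 0);                 /* non-padding mode only */
    if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_MAC_KEY, 16, (void *)mackey) <= 0)
        goto end;
    if (!EVP_EncryptUpdate(ctx, ct, &outl, pt, (int)ptlen)
            || !EVP_EncryptFinal_ex(ctx, ct + outl, &tmplen))
        goto end;
    /* read back the 64-byte HMAC-SHA512 tag computed over the ciphertext */
    if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, 64, tag) <= 0)
        goto end;
    ok = 1;
 end:
    EVP_CIPHER_CTX_free(ctx);
    EVP_CIPHER_free(cipher);
    return ok;
}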