git.ipfire.org Git - thirdparty/openssl.git/commitdiff
Implement interleaving aes-cbc-hmac-sha on aarch64
author: fangming.fang <fangming.fang@arm.com>
Wed, 17 Jan 2024 10:48:55 +0000 (10:48 +0000)
committer: Matt Caswell <matt@openssl.org>
Mon, 14 Apr 2025 13:53:30 +0000 (14:53 +0100)
This implements #19932: it adds enc-then-mac aes-cbc-hmac-sha1/256, in which
aes-cbc and hmac-sha1/256 are interleaved to achieve better performance.
It only supports non-padding mode, which means the length of the input data
must be a multiple of 16 bytes.

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
(Merged from https://github.com/openssl/openssl/pull/22949)

27 files changed:
apps/enc.c
apps/lib/opt.c
crypto/aes/asm/aes-sha1-armv8.pl [new file with mode: 0644]
crypto/aes/asm/aes-sha256-armv8.pl [new file with mode: 0644]
crypto/aes/build.info
crypto/evp/evp_lib.c
crypto/objects/obj_dat.h
crypto/objects/obj_mac.num
crypto/objects/objects.txt
include/crypto/aes_platform.h
include/openssl/evp.h
include/openssl/obj_mac.h
providers/common/include/prov/providercommon.h
providers/defltprov.c
providers/fips/fipsprov.c
providers/implementations/ciphers/build.info
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c [new file with mode: 0644]
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c [new file with mode: 0644]
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c [new file with mode: 0644]
providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h [new file with mode: 0644]
providers/implementations/ciphers/ciphercommon.c
providers/implementations/include/prov/implementations.h
providers/implementations/include/prov/names.h
test/evp_libctx_test.c
test/evp_test.c
test/recipes/30-test_evp_data/evpciph_aes_stitched.txt
util/perl/OpenSSL/paramnames.pm

index 3f45ba15e576d1befadbb83e0e65879343008701..cf26ac0a7d5adce3228a6db6766d2c4487bd6810 100644 (file)
@@ -807,6 +807,7 @@ static void show_ciphers(const OBJ_NAME *name, void *arg)
     cipher = EVP_get_cipherbyname(name->name);
     if (cipher == NULL
             || (EVP_CIPHER_get_flags(cipher) & EVP_CIPH_FLAG_AEAD_CIPHER) != 0
+            || (EVP_CIPHER_get_flags(cipher) & EVP_CIPH_FLAG_ENC_THEN_MAC) != 0
             || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_XTS_MODE)
         return;
 
index 0018c268c0b44da755fad8eb9ce3a7e84157356d..39276d828c875ad289f6ba7e2afc3ff0b02f3a56 100644 (file)
@@ -437,6 +437,8 @@ int opt_cipher(const char *name, EVP_CIPHER **cipherp)
             opt_printf_stderr("%s XTS ciphers not supported\n", prog);
         } else if ((flags & EVP_CIPH_FLAG_AEAD_CIPHER) != 0) {
             opt_printf_stderr("%s: AEAD ciphers not supported\n", prog);
+        } else if ((flags & EVP_CIPH_FLAG_ENC_THEN_MAC) != 0) {
+            opt_printf_stderr("%s: ENC-then-MAC cipher not supported\n", prog);
         } else {
             ret = 1;
             if (cipherp != NULL)
diff --git a/crypto/aes/asm/aes-sha1-armv8.pl b/crypto/aes/asm/aes-sha1-armv8.pl
new file mode 100644 (file)
index 0000000..8f8505d
--- /dev/null
@@ -0,0 +1,4312 @@
+#! /usr/bin/env perl
+
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (C) Cavium networks Ltd. 2016.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#========================================================================
+# Derived from following files in
+# https://github.com/ARM-software/AArch64cryptolib
+# AArch64cryptolib_opt_big/aes_cbc_sha1/aes128cbc_sha1_hmac.S
+# AArch64cryptolib_opt_big/aes_cbc_sha1/sha1_hmac_aes128cbc_dec.S
+#========================================================================
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$code=<<___;
+#include "arm_arch.h"
+
+# Theses are offsets into the CIPH_DIGEST struct
+#define CIPHER_KEY     0
+#define CIPHER_KEY_ROUNDS      8
+#define CIPHER_IV      16
+#define HMAC_IKEYPAD   24
+#define HMAC_OKEYPAD   32
+
+.text
+.arch armv8-a+crypto
+___
+
+sub aes192_aes256_handle () {
+       my $compare = shift;
+       my $label = shift;
+       my $i = shift;
+       my $load_rk10 = shift;
+
+       if($compare == 1) {
+$code.=<<___;
+       cmp     x16,#12
+___
+       }
+$code.=<<___;
+       b.lt    .Laes128_${label}_$i
+.Laes192_${label}_$i:
+       ldp     q30,q31,[x17],32        /* rk[10],rk[11] */
+       aese    v$i.16b,v17.16b
+       aesmc   v$i.16b,v$i.16b
+       aese    v$i.16b,v30.16b
+       aesmc   v$i.16b,v$i.16b
+       b.gt    .Laes256_${label}_$i
+       ld1     {v30.16b},[x17]         /* rk[12] */
+       aese    v$i.16b,v31.16b
+       eor     v$i.16b,v$i.16b,v30.16b
+       sub     x17, x17, #32           /* rewind x17 */
+       b       1f
+.Laes256_${label}_$i:
+       aese    v$i.16b,v31.16b
+       aesmc   v$i.16b,v$i.16b
+       ldp     q30,q31,[x17],32        /* rk[12],rk[13] */
+       aese    v$i.16b,v30.16b
+       aesmc   v$i.16b,v$i.16b
+       ld1     {v30.16b},[x17]         /* rk[14] */
+       aese    v$i.16b,v31.16b
+       eor     v$i.16b,v$i.16b,v30.16b
+       sub     x17, x17, #64           /* rewind x17 */
+       b       1f
+.Laes128_${label}_$i:
+___
+       if ($load_rk10 == 1) {
+$code.=<<___;
+       ld1     {v18.16b},[x9]
+___
+       }
+$code.=<<___;
+       aese    v$i.16b,v17.16b
+       eor     v$i.16b,v$i.16b,v18.16b /* res 0 */
+1:
+___
+}
+
+sub aes192_aes256_dec_handle () {
+       my $compare = shift;
+       my $label = shift;
+       my $i = shift;
+       my $load_rk10 = shift;
+
+       if($compare == 1) {
+$code.=<<___;
+       cmp     x16,#12
+___
+       }
+$code.=<<___;
+       b.lt    .Laes128_${label}_$i
+.Laes192_${label}_$i:
+       stp     q19,q23,[sp, #-32]!
+       ld1     {v19.16b},[x17],16      /* rk[10] */
+       ld1     {v23.16b},[x17],16      /* rk[11] */
+       aesd    v$i.16b,v17.16b
+       aesimc  v$i.16b,v$i.16b
+       aesd    v$i.16b,v19.16b
+       aesimc  v$i.16b,v$i.16b
+       b.gt    .Laes256_${label}_$i
+       ld1     {v19.16b},[x17]         /* rk[12] */
+       aesd    v$i.16b,v23.16b
+       eor     v$i.16b,v$i.16b,v19.16b
+       sub     x17, x17, #32           /* rewind x17 */
+       ldp     q19,q23,[sp], #32
+       b       1f
+.Laes256_${label}_$i:
+       aesd    v$i.16b,v23.16b
+       aesimc  v$i.16b,v$i.16b
+       ld1     {v19.16b},[x17],16      /* rk[12] */
+       ld1     {v23.16b},[x17],16      /* rk[13] */
+       aesd    v$i.16b,v19.16b
+       aesimc  v$i.16b,v$i.16b
+       ld1     {v19.16b},[x17]         /* rk[14] */
+       aesd    v$i.16b,v23.16b
+       eor     v$i.16b,v$i.16b,v19.16b
+       sub     x17, x17, #64           /* rewind x17 */
+       ldp     q19,q23,[sp], #32
+       b       1f
+.Laes128_${label}_$i:
+___
+       if ($load_rk10 == 1) {
+$code.=<<___;
+       ld1     {v18.16b},[x9]
+___
+       }
+$code.=<<___;
+       aesd    v$i.16b,v17.16b
+       eor     v$i.16b,v$i.16b,v18.16b /* res 0 */
+1:
+___
+}
+
+$code.=<<___;
+# Description:
+#
+# Combined Enc/Auth Primitive = aes128cbc/sha1_hmac
+#
+# Operations:
+#
+# out = encrypt-AES128CBC(in)
+# return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | out))
+#
+# Prototype:
+# int asm_aescbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+#                      uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+#                      CIPH_DIGEST *arg)
+#
+# Registers used:
+#
+# asm_aescbc_sha1_hmac(
+#      csrc,   x0      (cipher src address)
+#      cdst,   x1      (cipher dst address)
+#      clen    x2      (cipher length)
+#      dsrc,   x3      (digest src address)
+#      ddst,   x4      (digest dst address)
+#      dlen,   x5      (digest length)
+#      arg     x6:
+#              arg->cipher.key                 (round keys)
+#              arg->cipher.key_rounds          (key rounds)
+#              arg->cipher.iv                  (initialization vector)
+#              arg->digest.hmac.i_key_pad      (partially hashed i_key_pad)
+#              arg->digest.hmac.o_key_pad      (partially hashed o_key_pad)
+#      )
+#
+# Routine register definitions:
+#
+# v0 - v3 -- aes results
+# v4 - v7 -- round consts for sha
+# v8 - v18 -- round keys
+# v19 -- temp register for SHA1
+# v20 -- ABCD copy (q20)
+# v21 -- sha working state (q21)
+# v22 -- sha working state (q22)
+# v23 -- temp register for SHA1
+# v24 -- sha state ABCD
+# v25 -- sha state E
+# v26 -- sha block 0
+# v27 -- sha block 1
+# v28 -- sha block 2
+# v29 -- sha block 3
+# v30 -- reserved
+# v31 -- reserved
+#
+# Constraints:
+#
+# The variable "clen" must be a multiple of 16, otherwise results are not
+# defined. For AES partial blocks the user is required to pad the input
+# to modulus 16 = 0.
+# The variable "dlen" must be a multiple of 8 and greater or equal
+# to "clen". This constraint is strictly related to the needs of the IPSec
+# ESP packet. Encrypted payload is hashed along with the 8 byte ESP header,
+# forming ICV. Speed gain is achieved by doing both things at the same time,
+# hence lengths are required to match at least at the cipher level.
+#
+# Short lengths are not optimized at < 12 AES blocks
+
+.global asm_aescbc_sha1_hmac
+.type  asm_aescbc_sha1_hmac,%function
+
+.align 4
+.Lrcon:
+       .word   0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+       .word   0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+       .word   0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+       .word   0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+asm_aescbc_sha1_hmac:
+       AARCH64_VALID_CALL_TARGET
+       /* protect registers */
+       stp             d8,d9,[sp,#-64]!
+       /* fetch args */
+       ldr             x7, [x6, #HMAC_IKEYPAD]
+       /* init ABCD, E */
+       ldr             q24, [x7]
+       eor             v25.16b, v25.16b, v25.16b
+       ldr             s25, [x7, #16]
+       /* save pointer to o_key_pad partial hash */
+       ldr             x7, [x6, #HMAC_OKEYPAD]
+
+       stp             d10,d11,[sp,#16]
+
+       prfm            PLDL1KEEP,[x0,0]        /* pref next aes_ptr_in */
+       prfm            PLDL1KEEP,[x1,0]        /* pref next aes_ptr_out */
+       lsr             x10,x2,4                /* aes_blocks = len/16 */
+
+       stp             d12,d13,[sp,#32]
+       stp             d14,d15,[sp,#48]
+
+       ldr             x9, [x6, #CIPHER_KEY]
+       ldr             x16, [x6, #CIPHER_KEY_ROUNDS]
+       ldr             x6, [x6, #CIPHER_IV]
+       add             x17, x9, #160           /* point to the last 5 rounds keys */
+
+       /*
+        * init sha state, prefetch, check for small cases.
+        * Note that the output is prefetched as a load, for the in-place case
+       */
+       cmp             x10,12                  /* no main loop if <12 */
+       b.lt            .Lenc_short_cases       /* branch if < 12 */
+
+       /* proceed */
+       ld1             {v3.16b},[x6]           /* get 1st ivec */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       mov             x11,x2                  /* len -> x11 needed at end */
+       lsr             x12,x11,6               /* total_blocks */
+       /*
+        * now we can do the loop prolog, 1st aes sequence of 4 blocks
+        */
+       ldp             q8,q9,[x9],32           /* rk[0],rk[1] */
+       eor             v0.16b,v0.16b,v3.16b    /* xor w/ ivec (modeop) */
+
+       /* aes xform 0 */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       ldp             q10,q11,[x9],32         /* rk[2],rk[3] */
+       prfm            PLDL1KEEP,[x0,64]       /* pref next aes_ptr_in */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       prfm            PLDL1KEEP,[x1,64]       /* pref next aes_ptr_out  */
+       ldp             q12,q13,[x9],32         /* rk[4],rk[5] */
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x0],16
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       ldp             q14,q15,[x9],32         /* rk[6],rk[7] */
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       ldp             q16,q17,[x9],32         /* rk[8],rk[9] */
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+___
+       &aes192_aes256_handle(1, "enc_prolog", 0, 1);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v0.16b    /* xor w/ ivec (modeop) */
+
+       /* aes xform 1 */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x0],16
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,0*64]     /* rcon */
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       ld1             {v26.16b},[x3],16
+       prfm            PLDL1KEEP,[x8,2*64]     /* rcon */
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,4*64]     /* rcon */
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,6*64]     /* rcon */
+___
+       &aes192_aes256_handle(0, "enc_prolog", 1, 0);
+$code.=<<___;
+       prfm            PLDL1KEEP,[x8,8*64]     /* rcon */
+       eor             v2.16b,v2.16b,v1.16b    /* xor w/ivec (modeop) */
+
+       /* aes xform 2 */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x0],16
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       mov             x9,x0                   /* lead_ptr = aes_ptr_in */
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       prfm            PLDL1KEEP,[x8,10*64]    /* rcon */
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       ld1             {v27.16b},[x3],16
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       prfm            PLDL1KEEP,[x8,12*64]    /* rcon */
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       prfm            PLDL1KEEP,[x8,14*64]    /* rcon */
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+___
+       &aes192_aes256_handle(0, "enc_prolog", 2, 0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v2.16b    /* xor w/ ivec (modeop) */
+
+       /* aes xform 3 */
+       aese            v3.16b,v8.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v9.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v10.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v11.16b
+       aesmc           v3.16b,v3.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       ld1             {v28.16b},[x3],16
+       aese            v3.16b,v12.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v13.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v14.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v15.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v16.16b
+       aesmc           v3.16b,v3.16b
+       /* main_blocks = total_blocks - 1 */
+       sub             x15,x12,1
+       and             x13,x10,3               /* aes_blocks_left */
+___
+       &aes192_aes256_handle(0, "enc_prolog", 3, 0);
+$code.=<<___;
+    ldp                q4,q5,[x8],32           /* key0,key1 */
+       /*
+        * Note, aes_blocks_left := number after
+        * the main (sha) block is done. Can be 0
+        */
+
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       ld1             {v29.16b},[x3],16
+
+       ldp             q6,q7,[x8]              /* key2,key3 */
+
+       /* get outstanding bytes of the digest */
+       sub             x8,x5,x2
+       /* substract loaded bytes */
+       sub             x5,x5,64
+       /*
+        * main combined loop CBC
+        */
+.Lenc_main_loop:
+       /*
+        * because both mov, rev32 and eor have a busy cycle, this takes longer
+        * than it looks.
+        * That's OK since there are 6 cycles before we can use the load anyway;
+        * so this goes as fast as it can without SW pipelining (too complicated
+        * given the code size)
+        */
+       rev32           v26.16b,v26.16b
+       /* next aes block, update aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+       prfm            PLDL1KEEP,[x9,64]       /* pref next lead_ptr */
+       rev32           v27.16b,v27.16b
+       /* pref next aes_ptr_out, streaming  */
+       prfm            PLDL1KEEP,[x1,64]
+       eor             v0.16b,v0.16b,v3.16b    /* xor w/ prev value */
+
+       /* aes xform 0, sha quad 0 */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       rev32           v28.16b,v28.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x0],16
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s22,s24
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       add             v23.4s,v4.4s,v27.4s
+       /* no place to get rid of this stall */
+       rev32           v29.16b,v29.16b
+       sha1c           q24,s25,v19.4s
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s21,s24
+       add             v19.4s,v4.4s,v28.4s
+       sha1c           q24,s22,v23.4s
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+___
+    &aes192_aes256_handle(1, "enc_mainloop", 0, 0);
+$code.=<<___;
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       sha1su1         v26.4s,v29.4s
+       /* aes xform 1, sha quad 1 */
+       eor             v1.16b,v1.16b,v0.16b    /* mode op 1 xor w/prev value */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       add             v19.4s,v5.4s,v28.4s
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x0],16
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       sha1p           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       add             x9,x9,64                /* bump lead_ptr */
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       sha1h           s22,s24
+       add             v23.4s,v5.4s,v27.4s
+       sha1p           q24,s21,v19.4s
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+___
+    &aes192_aes256_handle(0, "enc_mainloop", 1, 0);
+$code.=<<___;
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       /* mode op 2 */
+       eor             v2.16b,v2.16b,v1.16b    /* mode of 2 xor w/prev value */
+
+       /* aes xform 2, sha quad 2 */
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+
+       add             v19.4s,v6.4s,v28.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       sha1su1         v28.4s,v27.4s
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       add             v19.4s,v6.4s,v26.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       sha1su1         v29.4s,v28.4s
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x0],16
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       add             v23.4s,v6.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+___
+    &aes192_aes256_handle(0, "enc_mainloop", 2, 0);
+$code.=<<___;
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+
+       /* mode op 3 */
+       eor             v3.16b,v3.16b,v2.16b    /* xor w/prev value */
+
+       sha1su1         v28.4s,v27.4s
+
+       /* aes xform 3, sha quad 3 */
+       aese            v3.16b,v8.16b
+       aesmc           v3.16b,v3.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       aese            v3.16b,v9.16b
+       aesmc           v3.16b,v3.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aese            v3.16b,v10.16b
+       aesmc           v3.16b,v3.16b
+       sha1su1         v29.4s,v28.4s
+       add             v19.4s,v7.4s,v26.4s
+       aese            v3.16b,v11.16b
+       aesmc           v3.16b,v3.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aese            v3.16b,v12.16b
+       aesmc           v3.16b,v3.16b
+       add             v23.4s,v7.4s,v27.4s
+       aese            v3.16b,v13.16b
+       aesmc           v3.16b,v3.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aese            v3.16b,v14.16b
+       aesmc           v3.16b,v3.16b
+       sub             x15,x15,1               /* dec block count */
+       add             v19.4s,v7.4s,v28.4s
+       aese            v3.16b,v15.16b
+       aesmc           v3.16b,v3.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aese            v3.16b,v16.16b
+       aesmc           v3.16b,v3.16b
+       add             v23.4s,v7.4s,v29.4s
+___
+    &aes192_aes256_handle(0, "enc_mainloop", 3, 0);
+$code.=<<___;
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       ldp             q26,q27,[x3],32
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+
+       ldp             q28,q29,[x3],32
+
+       sub             x5,x5,64
+       cbnz            x15,.Lenc_main_loop     /* loop if more to do */
+
+       mov             w15,0x80                /* that's the 1 of the pad */
+       /*
+        * epilog, process remaining aes blocks and b-2 sha block
+        * do this inline (no loop) to overlap with the sha part
+        * note there are 0-3 aes blocks left.
+        */
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+       cbz             x13, .Lbm2fromQ0        /* skip if none left */
+
+       /*
+        * mode op 0
+        * read next aes block, update aes_ptr_in
+        */
+       ld1             {v0.16b},[x0],16
+       eor             v0.16b,v0.16b,v3.16b    /* xor w/ prev value */
+
+       /* aes xform 0, sha quad 0 */
+       add             v19.4s,v4.4s,v26.4s
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       add             v23.4s,v4.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       sha1su1         v26.4s,v29.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       sha1su1         v27.4s,v26.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       sha1su1         v28.4s,v27.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       sha1h           s21,s24
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       sha1c           q24,s22,v23.4s
+       sha1su1         v29.4s,v28.4s
+___
+    &aes192_aes256_handle(1, "enc_epilog", 0, 0);
+$code.=<<___;
+       /* local copy of aes_blocks_left */
+       subs            x14,x13,1
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       /* if aes_blocks_left_count == 0 */
+       beq             .Lbm2fromQ1
+       /*
+        * mode op 1
+        * read next aes block, update aes_ptr_in
+        */
+       ld1             {v1.16b},[x0],16
+
+       eor             v1.16b,v1.16b,v0.16b    /* xor w/ prev value */
+
+       /* aes xform 1, sha quad 1 */
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       add             v19.4s,v5.4s,v28.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       sha1su1         v27.4s,v26.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       sha1su1         v28.4s,v27.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       sha1su1         v29.4s,v28.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       sha1su1         v26.4s,v29.4s
+___
+       &aes192_aes256_handle(1, "enc_epilog", 1, 0);
+$code.=<<___;
+       subs            x14,x14,1               /* dec counter */
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       /* if aes_blocks_left_count == 0 */
+       beq             .Lbm2fromQ2
+
+       /*
+        * mode op 2
+        * read next aes block, update aes_ptr_in
+        */
+       ld1             {v2.16b},[x0],16
+       eor             v2.16b,v2.16b,v1.16b    /* xor w/ prev value */
+
+       /* aes xform 2, sha quad 2 */
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       add             v23.4s,v6.4s,v29.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       sha1su1         v28.4s,v27.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       sha1su1         v29.4s,v28.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       sha1su1         v26.4s,v29.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       sha1h           s21,s24
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+       sha1m           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+___
+       &aes192_aes256_handle(1, "enc_epilog", 2, 0);
+$code.=<<___;
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       /* join common code at Quad 3 */
+       b               .Lbm2fromQ3
+
+       /*
+        * now there is the b-2 sha block before the final one. Execution takes over
+        * in the appropriate part of this depending on how many aes blocks were left.
+        * If there were none, the whole thing is executed.
+        */
+.Lbm2fromQ0:
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+.Lbm2fromQ1:
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+.Lbm2fromQ2:
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+.Lbm2fromQ3:
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       eor             v26.16b,v26.16b,v26.16b         /* zero reg */
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       eor             v27.16b,v27.16b,v27.16b         /* zero reg */
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       eor             v28.16b,v28.16b,v28.16b         /* zero reg */
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       /* Process remaining 0-3 AES blocks here */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       cbz             x13,.Lpost_long_Q0
+
+       /* 1st remaining AES block */
+       ld1             {v26.16b},[x3],16
+       sub             x5,x5,16
+       rev32           v26.16b,v26.16b
+       subs            x14,x13,1
+       b.eq            .Lpost_long_Q1
+
+       /* 2nd remaining AES block */
+       ld1             {v27.16b},[x3],16
+       sub             x5,x5,16
+       rev32           v27.16b,v27.16b
+       subs            x14,x14,1
+       b.eq            .Lpost_long_Q2
+
+       /* 3rd remaining AES block */
+       ld1             {v28.16b},[x3],16
+       sub             x5,x5,16
+       rev32           v28.16b,v28.16b
+       /* Allow for filling this sha1 block with the remaining digest src */
+       b               .Lpost_long_Q3
+       /*
+        * Process remaining 8B blocks of the digest
+        */
+.Lpost_long_Q0:
+       /* blk 0,1 */
+       /* assume final block */
+       mov             v26.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v26 value (0x80) */
+       mov             v26.d[0],x2
+       /* assume this was final block */
+       mov             v26.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v26.d[1],x2
+
+.Lpost_long_Q1:
+       /* blk 2,3 */
+       /* assume this is final block */
+       mov             v27.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v27 value (0x80) */
+       mov             v27.d[0],x2
+       /* assume this was final block */
+       mov             v27.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v27.d[1],x2
+
+.Lpost_long_Q2:
+       /* blk 4,5 */
+       /* assume this was final block */
+       mov             v28.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v28 value (0x80) */
+       mov             v28.d[0],x2
+       /* assume this was final block */
+       mov             v28.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v28.d[1],x2
+
+.Lpost_long_Q3:
+       /* blk 6,7 */
+       /* assume this was final block */
+       mov             v29.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v29 value (0x80) */
+       mov             v29.d[0],x2
+       /* assume this was final block */
+       mov             v29.b[11],w15
+       /*
+        * Outstanding 8B blocks left.
+        * Since there has to be another sha block with padding,
+        * we need to calculate hash without padding here.
+        */
+       cbz             x5,1f
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       rev32           x2,x2
+       /*
+        * Don't decrease x5 here.
+        * Use it to indicate necessity of constructing "1" padding at the end.
+        */
+       mov             v29.d[1],x2
+       /*
+        * That is enough of blocks, we allow up to 64 bytes in total.
+        * Now we have the sha1 to do for these 4 16B blocks
+        */
+1:
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+       add             v19.4s,v4.4s,v26.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       /* this was final block */
+       cbz             x5,.Lpost_long_loop
+       subs            x5,x5,8
+       /* loop if hash is not finished */
+       b.ne            .Lpost_long_Q0
+       /* set "1" of the padding if this was a final block */
+       mov             v26.b[3],w15
+
+.Lpost_long_loop:
+       /* Add outstanding bytes of digest source */
+       add             x11,x11,x8
+       /* Add one SHA-1 block since hash is calculated including i_key_pad */
+       add             x11,x11, #64
+       lsr             x12,x11,32                      /* len_hi */
+       and             x13,x11,0xffffffff              /* len_lo */
+       lsl             x12,x12,3                       /* len_hi in bits */
+       lsl             x13,x13,3                       /* len_lo in bits */
+
+       mov             v29.s[3],w13                    /* len_lo */
+       mov             v29.s[2],w12                    /* len_hi */
+
+       /* do last sha of pad block */
+       mov             v20.16b,v24.16b                 /* working ABCD <- ABCD */
+       add             v19.4s,v4.4s,v26.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v26.4s,v24.4s,v20.4s
+       add             v27.4s,v25.4s,v21.4s
+
+       /* Calculate final HMAC */
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+       /* load o_key_pad partial hash */
+       ldr             q24, [x7]
+       eor             v25.16b, v25.16b, v25.16b
+       ldr             s25, [x7, #16]
+
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+
+       /* Set padding 1 to the first reg */
+       mov             w11, #0x80              /* that's the 1 of the pad */
+       mov             v27.b[7], w11
+
+       mov             x11, #64+20             /* size of o_key_pad + inner hash */
+       lsl             x11, x11, 3
+       /* move length to the end of the block */
+       mov             v29.s[3], w11
+       lsr             x11, x11, 32
+       mov             v29.s[2], w11           /* and the higher part */
+
+       add             v19.4s,v4.4s,v26.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       ldp             d10,d11,[sp,#16]
+       ldp             d12,d13,[sp,#32]
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       ldp             d14,d15,[sp,#48]
+       ldp             d8,d9,[sp],#64
+
+       mov             x0, xzr
+
+       add             v24.4s,v24.4s,v20.4s
+       add             v25.4s,v25.4s,v21.4s
+       rev32           v24.16b, v24.16b
+       rev32           v25.16b, v25.16b
+
+       st1             {v24.16b}, [x4],16
+       st1             {v25.s}[0], [x4]
+
+       ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lenc_short_cases:
+       ldp             q8,q9,[x9],32
+       adr             x8,.Lrcon                       /* rcon */
+       mov             w15,0x80                        /* sha padding word */
+       ldp             q10,q11,[x9],32
+       lsl             x11,x10,4                       /* len = aes_blocks*16 */
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       ldp             q12,q13,[x9],32
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       ldp             q14,q15,[x9],32
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+       ldp             q4,q5,[x8],32                   /* key0, key1 */
+       ldp             q16,q17,[x9],32
+       ld1             {v3.16b},[x6]                   /* get ivec */
+       ldp             q6,q7,[x8]                      /* key2, key3 */
+       /* get outstanding bytes of the digest */
+       sub             x8,x5,x2
+       /*
+        * the idea in the short loop (at least 1) is to break out with the padding
+        * already in place excepting the final word.
+        */
+.Lenc_short_loop:
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       eor             v0.16b,v0.16b,v3.16b            /* xor w/ prev value */
+
+       /* aes xform 0 */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+___
+    &aes192_aes256_handle(1, "enc_short", 0, 1);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       /* load next 16 bytes for SHA-1 */
+       ld1             {v26.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       sub             x10,x10,1                       /* dec num_blocks */
+       /* load res to sha 0, endian swap */
+       rev32   v26.16b,v26.16b
+       cbz             x10,.Lpost_short_Q1             /* break if no more */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x0],16
+       eor             v1.16b,v1.16b,v0.16b            /* xor w/ prev value */
+
+       /* aes xform 1 */
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+___
+    &aes192_aes256_handle(1, "enc_short", 1, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       /* load next 16 bytes for SHA-1 */
+       ld1             {v27.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       sub             x10,x10,1                       /* dec num_blocks */
+       /* load res to sha 0, endian swap */
+       rev32   v27.16b,v27.16b
+       cbz             x10,.Lpost_short_Q2             /* break if no more */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x0],16
+       eor             v2.16b,v2.16b,v1.16b            /* xor w/ prev value */
+
+       /* aes xform 2 */
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+___
+    &aes192_aes256_handle(1, "enc_short", 2, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       /* load next 16 bytes for SHA-1 */
+       ld1             {v28.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       sub             x10,x10,1                       /* dec num_blocks */
+       /* load res to sha 0, endian swap */
+       rev32   v28.16b,v28.16b
+       cbz             x10,.Lpost_short_Q3             /* break if no more */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x0],16
+       eor             v3.16b,v3.16b,v2.16b            /* xor w/prev value */
+
+       /* aes xform 3 */
+       aese            v3.16b,v8.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v9.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v10.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v11.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v12.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v13.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v14.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v15.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v16.16b
+       aesmc           v3.16b,v3.16b
+___
+    &aes192_aes256_handle(1, "enc_short", 3, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       /* load next 16 bytes for SHA-1 */
+       ld1             {v29.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       mov             v20.16b,v24.16b                 /* working ABCD <- ABCD */
+       /* load res to sha 3, endian swap */
+       rev32   v29.16b,v29.16b
+       /*
+        * now we have the sha1 to do for these 4 aes blocks
+        */
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       sub             x10,x10,1                       /* dec num_blocks */
+       cbnz            x10,.Lenc_short_loop            /* keep looping if more */
+
+.Lpost_short_Q0:
+       /* assume this was final block */
+       mov             v26.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v26 value (0x80) */
+       mov             v26.d[0],x2
+       /* assume this was final block */
+       mov             v26.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v26.d[1],x2
+.Lpost_short_Q1:
+       /* zero out vectors */
+       eor             v27.16b,v27.16b,v27.16b
+       eor             v28.16b,v28.16b,v28.16b
+       eor             v29.16b,v29.16b,v29.16b
+       /* assume this is final block */
+       mov             v27.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v27 value (0x80) */
+       mov             v27.d[0],x2
+       /* assume this was final block */
+       mov             v27.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v27.d[1],x2
+.Lpost_short_Q2:
+       /* zero out vectors (repeated if came from Q0) */
+       eor             v28.16b,v28.16b,v28.16b
+       eor             v29.16b,v29.16b,v29.16b
+       /* assume this was final block */
+       mov             v28.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v28 value (0x80) */
+       mov             v28.d[0],x2
+       /* assume this was final block */
+       mov             v28.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v28.d[1],x2
+.Lpost_short_Q3:
+       /* zero out vector (repeated if came from Q1) */
+       eor             v29.16b,v29.16b,v29.16b
+       /* assume this was final block */
+       mov             v29.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v29 value (0x80) */
+       mov             v29.d[0],x2
+       /* assume this was final block */
+       mov             v29.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,1f
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       rev32           x2,x2
+       mov             v29.d[1],x2
+       /*
+        * That is enough blocks; we allow up to 64 bytes in total.
+        * Now we have the sha1 to do for these 4 16B blocks.
+        */
+1:
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+
+       add             v19.4s,v4.4s,v26.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       /* this was final block */
+       cbz             x5,.Lpost_short_loop
+       subs            x5,x5,8
+       /* loop if hash is not finished */
+       b.ne            .Lpost_short_Q0
+       /* set "1" of the padding if this was a final block */
+       mov             v26.b[3],w15
+
+/*
+ * there are between 0 and 3 aes blocks in the final sha1 blocks
+ */
+.Lpost_short_loop:
+       /* Add outstanding bytes of digest source */
+       add             x11,x11,x8
+       /* Add one SHA-1 block since hash is calculated including i_key_pad */
+       add             x11,x11, #64
+       lsr             x12,x11,32                      /* len_hi */
+       and             x13,x11,0xffffffff              /* len_lo */
+       lsl             x12,x12,3                       /* len_hi in bits */
+       lsl             x13,x13,3                       /* len_lo in bits */
+
+       mov             v29.s[3],w13                    /* len_lo */
+       mov             v29.s[2],w12                    /* len_hi */
+
+       /* do final block */
+       mov             v20.16b,v24.16b                 /* working ABCD <- ABCD */
+       add             v19.4s,v4.4s,v26.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v26.4s,v24.4s,v20.4s
+       add             v27.4s,v25.4s,v21.4s
+
+       /* Calculate final HMAC */
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+       /* load o_key_pad partial hash */
+       ldr             q24, [x7]
+       eor             v25.16b, v25.16b, v25.16b
+       ldr             s25, [x7, #16]
+       /* Set padding 1 to the first reg */
+       mov             w11, #0x80              /* that's the 1 of the pad */
+       mov             v27.b[7], w11
+
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+
+       mov             x11, #64+20             /* size of o_key_pad + inner hash */
+       lsl             x11, x11, 3
+       /* move length to the end of the block */
+       mov             v29.s[3], w11
+       lsr             x11, x11, 32
+       mov             v29.s[2], w11           /* and the higher part */
+       add             v19.4s,v4.4s,v26.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       ldp             d10,d11,[sp,#16]
+       ldp             d12,d13,[sp,#32]
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       ldp             d14,d15,[sp,#48]
+       ldp             d8,d9,[sp],#64
+
+       mov             x0, xzr
+
+       add             v24.4s,v24.4s,v20.4s
+       add             v25.4s,v25.4s,v21.4s
+       rev32           v24.16b, v24.16b
+       rev32           v25.16b, v25.16b
+
+       st1             {v24.16b}, [x4],16
+       st1             {v25.s}[0], [x4]
+
+       ret
+
+.size  asm_aescbc_sha1_hmac, .-asm_aescbc_sha1_hmac
+
+# Description:
+#
+# Combined Auth/Dec Primitive = sha1_hmac/aes128cbc
+#
+# Operations:
+#
+# out = decrypt-AES128CBC(in)
+# return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | in))
+#
+# Prototype:
+# asm_sha1_hmac_aescbc_dec(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+#                      uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+#                      CIPH_DIGEST  *arg)
+#
+# Registers used:
+#
+# asm_sha1_hmac_aescbc_dec(
+#      csrc,   x0      (cipher src address)
+#      cdst,   x1      (cipher dst address)
+#      clen    x2      (cipher length)
+#      dsrc,   x3      (digest src address)
+#      ddst,   x4      (digest dst address)
+#      dlen,   x5      (digest length)
+#      arg     x6      :
+#              arg->cipher.key                 (round keys)
+#              arg->cipher.key_rounds          (key rounds)
+#              arg->cipher.iv                  (initialization vector)
+#              arg->digest.hmac.i_key_pad      (partially hashed i_key_pad)
+#              arg->digest.hmac.o_key_pad      (partially hashed o_key_pad)
+#
+#
+# Routine register definitions:
+#
+# v0 - v3 -- aes results
+# v4 - v7 -- round consts for sha
+# v8 - v18 -- round keys
+# v19 -- temp register for SHA1
+# v20 -- ABCD copy (q20)
+# v21 -- sha working state (q21)
+# v22 -- sha working state (q22)
+# v23 -- temp register for SHA1
+# v24 -- sha state ABCD
+# v25 -- sha state E
+# v26 -- sha block 0
+# v27 -- sha block 1
+# v28 -- sha block 2
+# v29 -- sha block 3
+# v30 -- reserved
+# v31 -- reserved
+#
+#
+# Constraints:
+#
+# The variable "clen" must be a multiple of 16, otherwise results are not
+# defined. For partial AES blocks the user is required to pad the input
+# so that its length is a multiple of 16 bytes.
+#
+# The variable "dlen" must be a multiple of 8 and greater than or equal to
+# "clen". The maximum difference between "dlen" and "clen" cannot exceed
+# 64 bytes. This constraint is strictly related to the needs of the IPSec
+# ESP packet.
+# Short lengths are less optimized at < 16 AES blocks, however they are
+# somewhat optimized, and more so than the enc/auth versions.
+
+.global asm_sha1_hmac_aescbc_dec
+.type  asm_sha1_hmac_aescbc_dec,%function
+
+asm_sha1_hmac_aescbc_dec:
+       AARCH64_VALID_CALL_TARGET
+       /* protect registers */
+       stp             d8,d9,[sp,#-64]!
+       /* fetch args */
+       ldr             x7, [x6, #HMAC_IKEYPAD]
+       /* init ABCD, E */
+       ldr             q24, [x7]
+       eor             v25.16b, v25.16b, v25.16b
+       ldr             s25, [x7, #16]
+       /* save pointer to o_key_pad partial hash */
+       ldr             x7, [x6, #HMAC_OKEYPAD]
+
+       stp             d10,d11,[sp,#16]
+
+       prfm            PLDL1KEEP,[x0,0]        /* pref next aes_ptr_in */
+       prfm            PLDL1KEEP,[x1,0]        /* pref next aes_ptr_out */
+       lsr             x10,x2,4                /* aes_blocks = len/16 */
+
+       stp             d12,d13,[sp,#32]
+       stp             d14,d15,[sp,#48]
+
+       ldr             x9, [x6, #CIPHER_KEY]
+       ldr             x16, [x6, #CIPHER_KEY_ROUNDS]
+       ldr             x6, [x6, #CIPHER_IV]
+       add             x17, x9, #160           /* point to the last 5 rounds keys */
+       /*
+        * init sha state, prefetch, check for small cases.
+        * Note that the output is prefetched as a load, for the in-place case
+        */
+       cmp             x10,16                  /* no main loop if <16 */
+       blt             .Ldec_short_cases       /* branch if < 16 */
+
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       ldp             q4,q5,[x8],32           /* key0,key1 */
+       ldp             q6,q7,[x8],32           /* key2,key3 */
+
+       /* get outstanding bytes of the digest */
+       sub             x8,x5,x2
+
+       mov             x11,x2                  /* len -> x11 needed at end */
+       ld1             {v30.16b},[x6]          /* get 1st ivec */
+       lsr             x12,x11,6               /* total_blocks (sha) */
+       ldp             q26,q27,[x3],32         /* next w0,w1 */
+       rev32           v26.16b,v26.16b         /* endian swap w0 */
+       rev32           v27.16b,v27.16b         /* endian swap w1 */
+       ldp             q28,q29,[x3],32         /* next w2,w3 */
+       rev32           v28.16b,v28.16b         /* endian swap w2 */
+       rev32           v29.16b,v29.16b         /* endian swap w3 */
+
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+       /*
+        * now we can do the loop prolog, 1st sha1 block
+        */
+       prfm            PLDL1KEEP,[x0,64]       /* pref next aes_ptr_in */
+       prfm            PLDL1KEEP,[x1,64]       /* pref next aes_ptr_out */
+       /*
+        * do the first sha1 block on the plaintext
+        */
+       mov             v20.16b,v24.16b         /* init working ABCD */
+
+       add             v19.4s,v4.4s,v26.4s
+       add             v23.4s,v4.4s,v27.4s
+       /* quad 0 */
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       ld1             {v8.16b},[x9],16        /* rk[0] */
+       sha1c           q24,s25,v19.4s
+       sha1su1         v26.4s,v29.4s
+       ld1             {v9.16b},[x9],16        /* rk[1] */
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       add             v19.4s,v4.4s,v28.4s
+       ld1             {v10.16b},[x9],16       /* rk[2] */
+       sha1c           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       ld1             {v11.16b},[x9],16       /* rk[3] */
+       sha1c           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       ld1             {v12.16b},[x9],16       /* rk[4] */
+       sha1c           q24,s21,v19.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+       ld1             {v13.16b},[x9],16       /* rk[5] */
+       /* quad 1 */
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       ld1             {v14.16b},[x9],16       /* rk[6] */
+       sha1p           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       ld1             {v15.16b},[x9],16       /* rk[7] */
+       sha1p           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       ld1             {v16.16b},[x9],16       /* rk[8] */
+       sha1p           q24,s21,v19.4s
+       sha1su1         v26.4s,v29.4s
+       ld1             {v17.16b},[x9],16       /* rk[9] */
+       add             v19.4s,v6.4s,v28.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       ld1             {v18.16b},[x9],16       /* rk[10] */
+       sha1p           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+       /* quad 2 */
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+       /* quad 3 */
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       sha1h           s22,s24
+       ld1             {v26.16b},[x3],16       /* next w0 */
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s21,s24
+       ld1             {v27.16b},[x3],16       /* next w1 */
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s22,s24
+       ld1             {v28.16b},[x3],16       /* next w2 */
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       ld1             {v29.16b},[x3],16       /* next w3 */
+       sha1p           q24,s22,v23.4s
+
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+       /*
+        * aes_blocks_left := the number of AES blocks remaining after the
+        * main (sha) block is done. It can be 0; note that we account for
+        * the extra unwind in main_blocks.
+        */
+       sub             x15,x12,2               /* main_blocks=total_blocks-5 */
+       add             v24.4s,v24.4s,v20.4s
+       and             x13,x10,3               /* aes_blocks_left */
+       ld1             {v0.16b},[x0]           /* next aes block, no update */
+       add             v25.4s,v25.4s,v21.4s
+       /* next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       /* indicate AES blocks to write back */
+       mov             x9,xzr
+       /*
+        * main combined loop CBC, can be used by auth/enc version
+        */
+.Ldec_main_loop:
+       /*
+        * Because both mov, rev32 and eor have a busy cycle,
+        * this takes longer than it looks.
+        */
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       /* pref next aes_ptr_out, streaming */
+       prfm            PLDL1KEEP,[x1,64]
+       /* aes xform 0, sha quad 0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       add             v23.4s,v4.4s,v27.4s
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+       /* read next aes block, no update */
+       ld1             {v1.16b},[x0]
+       sha1c           q24,s25,v19.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s21,s24
+       add             v19.4s,v4.4s,v28.4s
+       sha1c           q24,s22,v23.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",0,0);
+$code.=<<___;
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ prev value */
+       /* get next aes block, with update */
+       ld1             {v30.16b},[x0],16
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+       /* aes xform 1, sha quad 1 */
+       sha1su0         v27.4s,v28.4s,v29.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       sha1h           s21,s24
+       add             v19.4s,v5.4s,v28.4s
+       sha1p           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       /* read next aes block, no update */
+       ld1             {v2.16b},[x0]
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       sha1p           q24,s22,v23.4s
+       sha1su1         v29.4s,v28.4s
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       sha1h           s22,s24
+       add             v19.4s,v5.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1p           q24,s21,v19.4s
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       sha1su1         v26.4s,v29.4s
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+       sha1su1         v27.4s,v26.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",1,0);
+$code.=<<___;
+       add             v19.4s,v6.4s,v28.4s
+       add             v23.4s,v6.4s,v29.4s
+       eor             v1.16b,v1.16b,v31.16b   /* mode op 1 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+       /* aes xform 2, sha quad 2 */
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       sha1su1         v29.4s,v28.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       sha1su1         v26.4s,v29.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       /* read next aes block, no update */
+       ld1             {v3.16b},[x0]
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       sha1su1         v27.4s,v26.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1h           s22,s24
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1m           q24,s21,v19.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",2,0);
+$code.=<<___;
+       sha1su1         v28.4s,v27.4s
+       add             v23.4s,v7.4s,v29.4s
+       add             v19.4s,v7.4s,v26.4s
+       eor             v2.16b,v2.16b,v30.16b   /* mode of 2 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+       /* aes xform 3, sha quad 3 */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       sha1h           s21,s24
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       sha1p           q24,s22,v23.4s
+       sha1su1         v29.4s,v28.4s
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s22,s24
+       ld1             {v26.16b},[x3],16       /* next w0 */
+       sha1p           q24,s21,v19.4s
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       add             v23.4s,v7.4s,v27.4s
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s21,s24
+       ld1             {v27.16b},[x3],16       /* next w1 */
+       sha1p           q24,s22,v23.4s
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       sub             x15,x15,1               /* dec block count */
+       add             v19.4s,v7.4s,v28.4s
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       ld1             {v0.16b},[x0]           /* next aes block, no update */
+       sha1h           s22,s24
+       ld1             {v28.16b},[x3],16       /* next w2 */
+       sha1p           q24,s21,v19.4s
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",3,0);
+$code.=<<___;
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s21,s24
+       ld1             {v29.16b},[x3],16       /* next w3 */
+       sha1p           q24,s22,v23.4s
+       add             v24.4s,v24.4s,v20.4s
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/ prev value */
+       /* next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+       add             v25.4s,v25.4s,v21.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+       /* loop if more to do */
+       cbnz            x15,.Ldec_main_loop
+       /*
+        * Now the loop epilog. Since the reads for sha have already been done
+        * in advance, we have to have an extra unwind.
+        * This is why the test for the short cases is 16 and not 12.
+        *
+        * The unwind, which is just the main loop without the tests or final reads.
+        */
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       /* pref next aes_ptr_out, streaming */
+       prfm            PLDL1KEEP,[x1,64]
+       /* aes xform 0, sha quad 0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       add             v19.4s,v4.4s,v26.4s
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       sha1su0         v26.4s,v27.4s,v28.4s
+       /* read next aes block, no update */
+       ld1             {v1.16b},[x0]
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       add             v23.4s,v4.4s,v27.4s
+       sha1c           q24,s25,v19.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+       sha1su1         v26.4s,v29.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       add             v19.4s,v4.4s,v28.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha1c           q24,s22,v23.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",0,0);
+$code.=<<___;
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       add             v23.4s,v5.4s,v27.4s
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       sha1su1         v26.4s,v29.4s
+       /* aes xform 1, sha quad 1 */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       sha1h           s21,s24
+       add             v19.4s,v5.4s,v28.4s
+       sha1p           q24,s22,v23.4s
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       sha1su0         v28.4s,v29.4s,v26.4s
+       /* read next aes block, no update */
+       ld1             {v2.16b},[x0]
+       sha1h           s22,s24
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       sha1p           q24,s21,v19.4s
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       sha1su1         v28.4s,v27.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",1,0);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v31.16b   /* mode op 1 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+       add             v19.4s,v6.4s,v28.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+       /* mode op 2 */
+       /* aes xform 2, sha quad 2 */
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       sha1su0         v28.4s,v29.4s,v26.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       sha1su1         v28.4s,v27.4s
+       add             v19.4s,v6.4s,v26.4s
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       sha1su1         v29.4s,v28.4s
+       /* read next aes block, no update */
+       ld1             {v3.16b},[x0]
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       add             v23.4s,v6.4s,v27.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",2,0);
+$code.=<<___;
+       eor             v2.16b,v2.16b,v30.16b   /* mode of 2 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+       sha1su1         v28.4s,v27.4s
+       add             v23.4s,v7.4s,v29.4s
+       /* mode op 3 */
+       /* aes xform 3, sha quad 3 */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       sha1su1         v29.4s,v28.4s
+       add             v19.4s,v7.4s,v26.4s
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       /* read first aes block, no bump */
+       ld1             {v0.16b},[x0]
+       add             v23.4s,v7.4s,v27.4s
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v28.4s
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       add             v23.4s,v7.4s,v29.4s
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",3,0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/ prev value */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       /*
+        * now we have to do the 4 aes blocks (b-2) that catch up to where sha is
+        */
+
+       /* aes xform 0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       /* read next aes block, no update */
+       ld1             {v1.16b},[x0]
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",0,0);
+$code.=<<___;
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ ivec (modeop) */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+
+       /* aes xform 1 */
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       /* read next aes block, no update */
+       ld1             {v2.16b},[x0]
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",1,0);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v31.16b   /* xor w/ ivec (modeop) */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       /* aes xform 2 */
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       /* read next aes block, no update */
+       ld1             {v3.16b},[x0]
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",2,0);
+$code.=<<___;
+       eor             v2.16b,v2.16b,v30.16b   /* xor w/ ivec (modeop) */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+
+       /* aes xform 3 */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       eor             v26.16b,v26.16b,v26.16b         /* zero the rest */
+       eor             v27.16b,v27.16b,v27.16b         /* zero the rest */
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+       eor             v28.16b,v28.16b,v28.16b         /* zero the rest */
+       eor             v29.16b,v29.16b,v29.16b         /* zero the rest */
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",3,0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/ ivec (modeop) */
+
+       add             x9,x9,4
+
+/*
+ * Now, there is the final b-1 sha1 padded block.
+ * This contains between 0-3 aes blocks. We take some pains to avoid read spill
+ * by only reading the blocks that are actually defined.
+ * this is also the final sha block code for the short_cases.
+ */
+.Ljoin_common:
+       mov             w15,0x80        /* that's the 1 of the pad */
+.Lpost_loop_Q0:
+       /* assume this was final block */
+       mov             v26.b[0],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       /* overwrite previous v26 value (0x80) */
+       mov             v26.d[0],x2
+       /* assume this was final block */
+       mov             v26.b[8],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       mov             v26.d[1],x2
+.Lpost_loop_Q1:
+       /* assume this is final block */
+       mov             v27.b[0],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       /* overwrite previous v27 value (0x80) */
+       mov             v27.d[0],x2
+       /* assume this was final block */
+       mov             v27.b[8],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       mov             v27.d[1],x2
+.Lpost_loop_Q2:
+       /* assume this was final block */
+       mov             v28.b[0],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       /* overwrite previous v28 value (0x80) */
+       mov             v28.d[0],x2
+       /* assume this was final block */
+       mov             v28.b[8],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       mov             v28.d[1],x2
+.Lpost_loop_Q3:
+       /* assume this was final block */
+       mov             v29.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v29 value (0x80) */
+       mov             v29.d[0],x2
+       /* assume this was final block */
+       mov             v29.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,1f
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       rev32           x2,x2
+       mov             v29.d[1],x2
+
+/*
+ * That is enough of blocks, we allow up to 64 bytes in total.
+ * Now we have the sha1 to do for these 4 16B blocks
+ */
+1:
+       rev32           v26.16b,v26.16b
+       rev32           v27.16b,v27.16b
+       rev32           v28.16b,v28.16b
+       //rev32         v29.16b,v29.16b
+
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s21,s24
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       sha1p           q24,s22,v23.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s22,s24
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       sha1p           q24,s21,v19.4s
+
+       sha1h           s21,s24
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+       add             v24.4s,v24.4s,v20.4s
+
+       /* this was final block */
+       cbz             x5,.Lpost_loop
+       subs            x5,x5,8
+       /* loop if hash is not finished */
+       b.ne            .Lpost_loop_Q0
+       /* set "1" of the padding if this was a final block */
+       mov             v26.b[0],w15
+
+.Lpost_loop:
+	/* Add outstanding bytes of digest source */
+	add		x11,x11,x8
+	/* Add one SHA-1 block since hash is calculated including i_key_pad */
+	add		x11,x11,#64
+	/* Split the 64-bit byte count into the two 32-bit words of the
+	 * SHA-1 length trailer (big-endian bit count, FIPS 180-4). */
+	lsr		x12,x11,32		/* len_hi = byte count >> 32 */
+	and		x14,x11,0xffffffff	/* len_lo = low 32 bits of byte count */
+	lsl		x12,x12,3		/* len_hi in bits */
+	lsl		x14,x14,3		/* len_lo in bits; NOTE(review): carry out of this shift is discarded when len_lo is moved via w14, so the encoding assumes total input < 2^29 bytes -- confirm callers bound the length */
+
+	rev32		v26.16b,v26.16b		/* fix endian w0 */
+	mov		v29.s[3],w14		/* len_lo */
+	rev32		v27.16b,v27.16b		/* fix endian w1 */
+	mov		v29.s[2],w12		/* len_hi */
+	rev32		v28.16b,v28.16b		/* fix endian w2 */
+
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+       /* skip write back if there were less than 4 AES blocks */
+       cbz             x9,1f
+       /*
+        * At this point all data should be fetched for SHA.
+        * Save remaining blocks without danger of overwriting SHA source.
+        */
+       stp             q0,q1,[x1],32
+       stp             q2,q3,[x1],32
+1:
+       /*
+        * final sha block
+        * The strategy is to combine the 0-3 aes blocks, which is faster but
+        * a little gourmand on code space.
+        */
+       cbz             x13,.Lzero_aes_blocks_left      /* none to do */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0]
+       ld1             {v31.16b},[x0],16
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       add             v19.4s,v4.4s,v26.4s
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       add             v23.4s,v4.4s,v27.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha1c           q24,s25,v19.4s
+       sha1su1         v26.4s,v29.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v27.4s,v26.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha1c           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+       add             v23.4s,v4.4s,v29.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_final1",0,0);
+$code.=<<___;
+       sha1su1         v29.4s,v28.4s
+       eor             v3.16b,v0.16b,v30.16b   /* xor w/ ivec (modeop) */
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       sha1su1         v26.4s,v29.4s
+       /* dec counter */
+       sub             x13,x13,1
+       cbz             x13,.Lfrmquad1
+
+       /* aes xform 1 */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0]
+       ld1             {v30.16b},[x0],16
+       add             v23.4s,v5.4s,v27.4s
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       add             v19.4s,v5.4s,v28.4s
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v27.4s,v28.4s,v29.4s
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v27.4s,v26.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v28.4s,v27.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v29.4s,v28.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_final2",0,0);
+$code.=<<___;
+       sha1su1         v26.4s,v29.4s
+       eor             v3.16b,v0.16b,v31.16b   /* xor w/ ivec (modeop) */
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       sha1su1         v27.4s,v26.4s
+
+       sub             x13,x13,1               /* dec counter */
+       cbz             x13,.Lfrmquad2
+
+       /* aes xform 2 */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       add             v19.4s,v6.4s,v28.4s
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       add             v23.4s,v6.4s,v29.4s
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       sha1m           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha1m           q24,s22,v23.4s
+       sha1su1         v29.4s,v28.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       add             v19.4s,v6.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha1su1         v26.4s,v29.4s
+       add             v23.4s,v6.4s,v27.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_final3",0,0);
+$code.=<<___;
+       sha1su1         v27.4s,v26.4s
+       eor             v3.16b,v0.16b,v30.16b   /* xor w/ ivec (modeop) */
+       add             v19.4s,v6.4s,v28.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       sha1su1         v28.4s,v27.4s
+       b               .Lfrmquad3
+
+/*
+ * The final block with no aes component, i.e from here there were zero blocks
+ */
+.Lzero_aes_blocks_left:
+
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       sha1su1         v26.4s,v29.4s
+
+/* quad 1 */
+.Lfrmquad1:
+       add             v23.4s,v5.4s,v27.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       sha1su1         v27.4s,v26.4s
+
+/* quad 2 */
+.Lfrmquad2:
+       add             v19.4s,v6.4s,v28.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       sha1su1         v28.4s,v27.4s
+
+/* quad 3 */
+.Lfrmquad3:
+       add             v23.4s,v7.4s,v29.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v26.4s,v24.4s,v20.4s
+       add             v27.4s,v25.4s,v21.4s
+
+       /* calculate final HMAC */
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+       /* load o_key_pad partial hash */
+       ldr             q24, [x7]
+       eor             v25.16b, v25.16b, v25.16b
+       ldr             s25, [x7, #16]
+       /* working ABCD <- ABCD */
+       mov             v20.16b,v24.16b
+
+       /* set padding 1 to the first reg */
+       mov             w11, #0x80              /* that's the 1 of the pad */
+       mov             v27.b[7], w11
+       /* size of o_key_pad + inner hash */
+       mov             x11, #64+20
+       /* move length to the end of the block */
+       lsl             x11, x11, 3
+       mov             v29.s[3], w11
+       lsr             x11, x11, 32
+       mov             v29.s[2], w11           /* and the higher part */
+
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       ldp             d10,d11,[sp,#16]
+       ldp             d12,d13,[sp,#32]
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       ldp             d14,d15,[sp,#48]
+       ldp             d8,d9,[sp],#64
+
+       mov             x0, xzr
+
+       add             v24.4s,v24.4s,v20.4s
+       add             v25.4s,v25.4s,v21.4s
+
+       rev32           v24.16b, v24.16b
+       rev32           v25.16b, v25.16b
+
+       st1             {v24.16b}, [x4],16
+       st1             {v25.s}[0], [x4]
+
+       ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Ldec_short_cases:
+       ldp             q8,q9,[x9],32
+       adr             x8,.Lrcon               /* rcon */
+       ldp             q10,q11,[x9],32
+       lsl             x11,x10,4               /* len = aes_blocks*16 */
+
+       ldp             q12,q13,[x9],32
+       ldp             q4,q5,[x8],32           /* key0, key1 */
+       ldp             q14,q15,[x9],32
+       ld1             {v30.16b},[x6]          /* get ivec */
+       ldp             q16,q17,[x9],32
+       ldp             q6,q7,[x8]              /* key2, key3 */
+       ld1             {v18.16b},[x9]
+
+       /* get outstanding bytes of the digest */
+       sub             x8,x5,x2
+
+       /* indicate AES blocks to write back */
+       mov             x9,xzr
+
+       mov             x2,x0
+       /*
+	 * Digest source has to be at least as long as the cipher source,
+        * therefore it is safe to use x10 to indicate whether we can
+        * overtake cipher processing by 4 AES block here.
+        */
+       cmp             x10,4                   /* check if 4 or more */
+       /* if less, bail to last block */
+       blt             .Llast_sha_block
+
+       ldp             q26,q27,[x3],32
+       rev32   v26.16b,v26.16b
+       rev32   v27.16b,v27.16b
+       ldp             q28,q29,[x3],32
+       rev32   v28.16b,v28.16b
+       rev32   v29.16b,v29.16b
+
+       sub             x5,x5,64
+
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+
+       /* quad 0 */
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       /* quad 1 */
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       /* quad 2 */
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       /* quad 3 */
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       /* there were at least 4 AES blocks to process */
+       b               .Lshort_loop_no_store
+
+.Ldec_short_loop:
+       cmp             x10,4                   /* check if 4 or more */
+       /* if less, bail to last block */
+       blt             .Llast_sha_block
+
+       stp             q0,q1,[x1],32
+       stp             q2,q3,[x1],32
+
+       sub             x9,x9,4
+
+.Lshort_loop_no_store:
+
+       ld1             {v31.16b},[x2]          /* next w no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v0.16b},[x2],16
+
+       add             x0,x0,64
+
+       /* aes xform 0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",0,0);
+$code.=<<___;
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ prev value */
+
+       ld1             {v30.16b},[x2]          /* read no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x2],16
+
+       /* aes xform 1 */
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",1,0);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v31.16b   /* xor w/ prev value */
+
+       ld1             {v31.16b},[x2]          /* read no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x2],16
+
+       /* aes xform 2 */
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",2,0);
+$code.=<<___;
+       eor             v2.16b,v2.16b,v30.16b   /* xor w/ prev value */
+
+       ld1             {v30.16b},[x2]          /* read no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x2],16
+
+       /* aes xform 3 */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",3,0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/ prev value */
+
+       add             x9,x9,4
+
+       sub             x10,x10,4               /* 4 less */
+       cmp             x5,64
+       b.lt            .Ldec_short_loop        /* keep looping */
+
+       ldp             q26,q27,[x3],32
+       rev32           v26.16b,v26.16b
+       rev32           v27.16b,v27.16b
+       ldp             q28,q29,[x3],32
+       rev32           v28.16b,v28.16b
+       rev32           v29.16b,v29.16b
+
+       sub             x5,x5,64
+
+       mov             v20.16b,v24.16b         /* working ABCD <- ABCD */
+
+       /* quad 0 */
+       add             v19.4s,v4.4s,v26.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s25,v19.4s
+       add             v23.4s,v4.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v4.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1c           q24,s22,v23.4s
+       add             v19.4s,v4.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1c           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       /* quad 1 */
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v5.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+       add             v23.4s,v5.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       /* quad 2 */
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v6.4s,v27.4s
+       sha1su1         v26.4s,v29.4s
+
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1h           s21,s24
+       sha1m           q24,s22,v23.4s
+       add             v19.4s,v6.4s,v28.4s
+       sha1su1         v27.4s,v26.4s
+
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1h           s22,s24
+       sha1m           q24,s21,v19.4s
+       add             v23.4s,v7.4s,v29.4s
+       sha1su1         v28.4s,v27.4s
+
+       /* quad 3 */
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+       add             v19.4s,v7.4s,v26.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v23.4s,v7.4s,v27.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       add             v19.4s,v7.4s,v28.4s
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v23.4s,v7.4s,v29.4s
+       sha1h           s22,s24
+       sha1p           q24,s21,v19.4s
+
+       sha1h           s21,s24
+       sha1p           q24,s22,v23.4s
+
+       add             v25.4s,v25.4s,v21.4s
+       add             v24.4s,v24.4s,v20.4s
+
+       b               .Ldec_short_loop                /* keep looping */
+/*
+ * This is arranged so that we can join the common unwind code
+ * that does the last sha block and the final 0-3 aes blocks
+ */
+.Llast_sha_block:
+       eor             v26.16b,v26.16b,v26.16b         /* zero the rest */
+       eor             v27.16b,v27.16b,v27.16b         /* zero the rest */
+       eor             v28.16b,v28.16b,v28.16b         /* zero the rest */
+       eor             v29.16b,v29.16b,v29.16b         /* zero the rest */
+
+       mov             x13,x10                 /* copy aes blocks for common */
+       b               .Ljoin_common           /* join common code */
+
+.size  asm_sha1_hmac_aescbc_dec, .-asm_sha1_hmac_aescbc_dec
+___
+
+# Expand any `...` escapes via eval and print the generated source.
+# Output is produced only for the 64-bit flavour: this file is AArch64-only.
+if ($flavour =~ /64/) {
+	for my $line (split "\n", $code) {
+		$line =~ s/\`([^\`]*)\`/eval($1)/ge;
+		print $line, "\n";
+	}
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/aes/asm/aes-sha256-armv8.pl b/crypto/aes/asm/aes-sha256-armv8.pl
new file mode 100644 (file)
index 0000000..766b2ef
--- /dev/null
@@ -0,0 +1,4631 @@
+#! /usr/bin/env perl
+
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (C) Cavium networks Ltd. 2016.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#========================================================================
+# Derived from following files in
+# https://github.com/ARM-software/AArch64cryptolib
+# AArch64cryptolib_opt_big/aes_cbc_sha256/aes128cbc_sha256_hmac.S
+# AArch64cryptolib_opt_big/aes_cbc_sha256/sha256_hmac_aes128cbc_dec.S
+#========================================================================
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$code=<<___;
+#include "arm_arch.h"
+
+# These are offsets into the CIPH_DIGEST struct
+#define CIPHER_KEY     0
+#define CIPHER_KEY_ROUNDS      8
+#define CIPHER_IV      16
+#define HMAC_IKEYPAD   24
+#define HMAC_OKEYPAD   32
+
+.text
+.arch armv8-a+crypto
+___
+
+sub aes192_aes256_handle () {
+       my $compare = shift;
+       my $label = shift;
+       my $i = shift;
+       my $load_rk10 = shift;
+
+       if($compare == 1) {
+$code.=<<___;
+       cmp     x16,#12
+___
+       }
+$code.=<<___;
+       b.lt    .Laes128_${label}_$i
+.Laes192_${label}_$i:
+       ldp     q30,q31,[x17],32        /* rk[10],rk[11] */
+       aese    v$i.16b,v17.16b
+       aesmc   v$i.16b,v$i.16b
+       aese    v$i.16b,v30.16b
+       aesmc   v$i.16b,v$i.16b
+       b.gt    .Laes256_${label}_$i
+       ld1     {v30.16b},[x17]         /* rk[12] */
+       aese    v$i.16b,v31.16b
+       eor     v$i.16b,v$i.16b,v30.16b
+       sub     x17, x17, #32           /* rewind x17 */
+       b       1f
+.Laes256_${label}_$i:
+       aese    v$i.16b,v31.16b
+       aesmc   v$i.16b,v$i.16b
+       ldp     q30,q31,[x17],32        /* rk[12],rk[13] */
+       aese    v$i.16b,v30.16b
+       aesmc   v$i.16b,v$i.16b
+       ld1     {v30.16b},[x17]         /* rk[14] */
+       aese    v$i.16b,v31.16b
+       eor     v$i.16b,v$i.16b,v30.16b
+       sub     x17, x17, #64           /* rewind x17 */
+       b       1f
+.Laes128_${label}_$i:
+___
+       if ($load_rk10 == 1) {
+$code.=<<___;
+       ld1     {v18.16b},[x9]
+___
+       }
+$code.=<<___;
+       aese    v$i.16b,v17.16b
+       eor     v$i.16b,v$i.16b,v18.16b /* res 0 */
+1:
+___
+}
+
+sub aes192_aes256_dec_handle () {
+       my $compare = shift;
+       my $label = shift;
+       my $i = shift;
+       my $load_rk10 = shift;
+
+       if($compare == 1) {
+$code.=<<___;
+       cmp     x16,#12
+___
+       }
+$code.=<<___;
+       b.lt    .Laes128_${label}_$i
+.Laes192_${label}_$i:
+       stp     q19,q23,[sp, #-32]!
+       ld1     {v19.16b},[x17],16      /* rk[10] */
+       ld1     {v23.16b},[x17],16      /* rk[11] */
+       aesd    v$i.16b,v17.16b
+       aesimc  v$i.16b,v$i.16b
+       aesd    v$i.16b,v19.16b
+       aesimc  v$i.16b,v$i.16b
+       b.gt    .Laes256_${label}_$i
+       ld1     {v19.16b},[x17]         /* rk[12] */
+       aesd    v$i.16b,v23.16b
+       eor     v$i.16b,v$i.16b,v19.16b
+       sub     x17, x17, #32           /* rewind x17 */
+       ldp     q19,q23,[sp], #32
+       b       1f
+.Laes256_${label}_$i:
+       aesd    v$i.16b,v23.16b
+       aesimc  v$i.16b,v$i.16b
+       ld1     {v19.16b},[x17],16      /* rk[12] */
+       ld1     {v23.16b},[x17],16      /* rk[13] */
+       aesd    v$i.16b,v19.16b
+       aesimc  v$i.16b,v$i.16b
+       ld1     {v19.16b},[x17]         /* rk[14] */
+       aesd    v$i.16b,v23.16b
+       eor     v$i.16b,v$i.16b,v19.16b
+       sub     x17, x17, #64           /* rewind x17 */
+       ldp     q19,q23,[sp], #32
+       b       1f
+.Laes128_${label}_$i:
+___
+       if ($load_rk10 == 1) {
+$code.=<<___;
+       ld1     {v18.16b},[x9]
+___
+       }
+$code.=<<___;
+       aesd    v$i.16b,v17.16b
+       eor     v$i.16b,v$i.16b,v18.16b /* res 0 */
+1:
+___
+}
+
+$code.=<<___;
+# Description:
+#
+# Combined Enc/Auth Primitive = aes128cbc/sha256_hmac
+#
+# Operations:
+#
+# out = encrypt-AES128CBC(in)
+# return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | out))
+#
+# Prototype:
+# void asm_aescbc_sha256_hmac(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+#                      uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+#                      CIPH_DIGEST *arg)
+#
+# Registers used:
+#
+# asm_aescbc_sha256_hmac(
+#      csrc,   x0      (cipher src address)
+#      cdst,   x1      (cipher dst address)
+#      clen    x2      (cipher length)
+#      dsrc,   x3      (digest src address)
+#      ddst,   x4      (digest dst address)
+#      dlen,   x5      (digest length)
+#      arg     x6      :
+#              arg->cipher.key                 (round keys)
+#              arg->cipher.key_rounds          (key rounds)
+#              arg->cipher.iv                  (initialization vector)
+#              arg->digest.hmac.i_key_pad      (partially hashed i_key_pad)
+#              arg->digest.hmac.o_key_pad      (partially hashed o_key_pad)
+#      )
+#
+# Routine register definitions:
+#
+# v0  -- v3 -- aes results
+# v4  -- v7 -- round consts for sha
+# v8  -- v18 -- round keys
+# v19 -- v20 -- round keys
+# v21 -- ABCD tmp
+# v22 -- sha working state ABCD (q22)
+# v23 -- sha working state EFGH (q23)
+# v24 -- sha state ABCD
+# v25 -- sha state EFGH
+# v26 -- sha block 0
+# v27 -- sha block 1
+# v28 -- sha block 2
+# v29 -- sha block 3
+# v30 -- reserved
+# v31 -- reserved
+#
+# Constraints:
+#
+# The variable "clen" must be a multiple of 16, otherwise results
+# are not defined. For AES partial blocks the user is required
+# to pad the input to modulus 16 = 0.
+# The variable "dlen" must be a multiple of 8 and greater than or equal
+# to "clen". This constraint is strictly related to the needs of the IPSec
+# ESP packet. Encrypted payload is hashed along with the 8 byte ESP header,
+# forming ICV. Speed gain is achieved by doing both things at the same time,
+# hence lengths are required to match at least at the cipher level.
+#
+# Short lengths are not optimized at < 12 AES blocks
+
+.global        asm_aescbc_sha256_hmac
+.type  asm_aescbc_sha256_hmac,%function
+
+.align 4
+.Lrcon:
+       .word   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+.Linit_sha_state:
+       .word   0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+       .word   0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+asm_aescbc_sha256_hmac:
+       AARCH64_VALID_CALL_TARGET
+       /* protect registers */
+       stp             d8,d9,[sp,#-64]!
+       /* fetch args */
+       ldr             x7, [x6, #HMAC_IKEYPAD]
+       /* init ABCD, EFGH. */
+       ldp             q24,q25,[x7]
+       /* save pointer to o_key_pad partial hash */
+       ldr             x7, [x6, #HMAC_OKEYPAD]
+
+       stp             d10,d11,[sp,#16]
+
+       /* address of sha init state consts */
+       adr             x12,.Linit_sha_state
+       prfm            PLDL1KEEP,[x1,0]        /* pref next aes_ptr_out */
+       lsr             x10,x2,4                /* aes_blocks = len/16 */
+
+       stp             d12,d13,[sp,#32]
+       stp             d14,d15,[sp,#48]
+
+       ldr             x9, [x6, #CIPHER_KEY]
+       ldr             x16, [x6, #CIPHER_KEY_ROUNDS]
+       ldr             x6, [x6, #CIPHER_IV]
+       add             x17, x9, #160           /* point to the last 5 rounds keys */
+
+       /*
+        * Init sha state, prefetch, check for small cases.
+        * Note that the output is prefetched as a load, for the in-place case
+       */
+       prfm            PLDL1KEEP,[x0,0]        /* pref next aes_ptr_in */
+       cmp             x10,12                  /* no main loop if <12 */
+       b.lt            .Lenc_short_cases       /* branch if < 12 */
+
+       /* proceed */
+       ld1             {v3.16b},[x6]           /* get 1st ivec */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       mov             x11,x2                  /* len -> x11 needed at end */
+       lsr             x12,x11,6               /* total_blocks */
+       /*
+        * now we can do the loop prolog, 1st aes sequence of 4 blocks
+        */
+       ld1             {v8.16b},[x9],16        /* rk[0] */
+       ld1             {v9.16b},[x9],16        /* rk[1] */
+       eor             v0.16b,v0.16b,v3.16b    /* xor w/ ivec (modeop) */
+       ld1             {v10.16b},[x9],16       /* rk[2] */
+
+       /* aes xform 0 */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       prfm            PLDL1KEEP,[x0,64]       /* pref next aes_ptr_in */
+       ld1             {v11.16b},[x9],16       /* rk[3] */
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       prfm            PLDL1KEEP,[x1,64]       /* pref next aes_ptr_out  */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       ld1             {v12.16b},[x9],16       /* rk[4] */
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x0],16
+       ld1             {v13.16b},[x9],16       /* rk[5] */
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       ld1             {v14.16b},[x9],16       /* rk[6] */
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       ld1             {v15.16b},[x9],16       /* rk[7] */
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       ld1             {v16.16b},[x9],16       /* rk[8] */
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       ld1             {v17.16b},[x9],16       /* rk[9] */
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+___
+       &aes192_aes256_handle(1, "enc_prolog", 0, 1);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v0.16b    /* xor w/ ivec (modeop) */
+
+/* aes xform 1 */
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x0],16
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,0*64]     /* rcon */
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       ld1             {v26.16b},[x3],16
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,2*64]     /* rcon */
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,4*64]     /* rcon */
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       prfm            PLDL1KEEP,[x8,6*64]     /* rcon */
+___
+    &aes192_aes256_handle(0, "enc_prolog", 1, 0);
+$code.=<<___;
+       prfm            PLDL1KEEP,[x8,8*64]     /* rcon */
+       eor             v2.16b,v2.16b,v1.16b    /* xor w/ ivec (modeop) */
+
+       /* aes xform 2 */
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x0],16
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       prfm            PLDL1KEEP,[x8,10*64]    /* rcon */
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       ld1             {v27.16b},[x3],16
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       prfm            PLDL1KEEP,[x8,12*64]    /* rcon */
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       prfm            PLDL1KEEP,[x8,14*64]    /* rcon */
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+___
+    &aes192_aes256_handle(0, "enc_prolog", 2, 0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v2.16b    /* xor w/ivec (modeop) */
+
+       /* aes xform 3 */
+       aese            v3.16b,v8.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v9.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v10.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v11.16b
+       aesmc           v3.16b,v3.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       ld1             {v28.16b},[x3],16
+       aese            v3.16b,v12.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v13.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v14.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v15.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v16.16b
+       aesmc           v3.16b,v3.16b
+       sub             x15,x12,1               /* main_blocks = total_blocks - 1 */
+       and             x13,x10,3               /* aes_blocks_left */
+___
+    &aes192_aes256_handle(0, "enc_prolog", 3, 0);
+$code.=<<___;
+       /*
+        * Note, aes_blocks_left := number after the main (sha)
+        * block is done. Can be 0
+        */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       ld1             {v29.16b},[x3],16
+
+       /* get outstanding bytes of the digest */
+       sub             x12,x5,x2
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+
+       /*
+        * main combined loop CBC
+        */
+.Lenc_main_loop:
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       /*
+        * Because mov, rev32 and eor each have a busy cycle, this takes longer
+        * than it looks. That's OK since there are 6 cycles before we can use
+        * the load anyway; so this goes as fast as it can without SW
+        * pipelining(too complicated given the code size)
+        */
+       rev32           v26.16b,v26.16b
+       /* next aes block, update aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       prfm            PLDL1KEEP,[x9,64]       /* pref next lead_ptr */
+       rev32           v27.16b,v27.16b
+       /* pref next aes_ptr_out, streaming  */
+       prfm            PLDL1KEEP,[x1,64]
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       ld1             {v4.16b},[x8],16        /* key0 */
+       eor             v0.16b,v0.16b,v3.16b    /* xor w/ prev value */
+       ld1             {v5.16b},[x8],16        /* key1 */
+       /*
+        * aes xform 0, sha quad 0
+        */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       ld1             {v6.16b},[x8],16        /* key2 */
+       rev32           v28.16b,v28.16b
+       ld1             {v7.16b},[x8],16        /* key3  */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x0],16
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su0       v26.4s,v27.4s
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       /* no place to get rid of this stall */
+       rev32           v29.16b,v29.16b
+       sha256h2        q23, q21, v4.4s
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su0       v27.4s,v28.4s
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       sha256h         q22, q23, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256h2        q23, q21, v5.4s
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+       sha256su0       v28.4s,v29.4s
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256su0       v29.4s,v26.4s
+       sha256h         q22, q23, v7.4s
+___
+       &aes192_aes256_handle(1, "enc_mainloop", 0, 0);
+$code.=<<___;
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+
+       /* aes xform 1, sha quad 1 */
+       sha256su0       v26.4s,v27.4s
+       eor             v1.16b,v1.16b,v0.16b    /* mode op 1 xor w/prev value */
+       ld1             {v7.16b},[x8],16        /* key7  */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       sha256h         q22, q23, v4.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h2        q23, q21, v4.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x0],16
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       ld1             {v5.16b},[x8],16        /* key5 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256su0       v28.4s,v29.4s
+       sha256h         q22, q23, v6.4s
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       sha256h2        q23, q21, v6.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+___
+       &aes192_aes256_handle(0, "enc_mainloop", 1, 0);
+$code.=<<___;
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+
+       /* mode op 2 */
+       eor             v2.16b,v2.16b,v1.16b    /* mode op 2 xor w/prev value */
+
+       /* aes xform 2, sha quad 2 */
+       sha256su0       v26.4s,v27.4s
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su0       v27.4s,v28.4s
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256su0       v28.4s,v29.4s
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su0       v29.4s,v26.4s
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x0],16
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v29.4s,v27.4s,v28.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+___
+       &aes192_aes256_handle(0, "enc_mainloop", 2, 0);
+$code.=<<___;
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+
+       /* mode op 3 */
+       eor             v3.16b,v3.16b,v2.16b    /* xor w/prev value */
+
+       /* aes xform 3, sha quad 3 (hash only) */
+       aese            v3.16b,v8.16b
+       aesmc           v3.16b,v3.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       aese            v3.16b,v9.16b
+       aesmc           v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aese            v3.16b,v10.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v11.16b
+       aesmc           v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aese            v3.16b,v12.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v13.16b
+       aesmc           v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aese            v3.16b,v14.16b
+       aesmc           v3.16b,v3.16b
+       sub             x15,x15,1               /* dec block count */
+       aese            v3.16b,v15.16b
+       aesmc           v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aese            v3.16b,v16.16b
+       aesmc           v3.16b,v3.16b
+___
+       &aes192_aes256_handle(0, "enc_mainloop", 3, 0);
+$code.=<<___;
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+
+       ldp             q26,q27,[x3],32
+       ldp             q28,q29,[x3],32
+       sub             x5,x5,64
+
+       cbnz            x15,.Lenc_main_loop     /* loop if more to do */
+
+       mov             w15,0x80                /* that's the 1 of the pad */
+       /*
+        * epilog, process remaining aes blocks and b-2 sha block
+        * do this inline (no loop) to overlap with the sha part
+        * note there are 0-3 aes blocks left.
+        */
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       cbz             x13, .Lbm2fromQ0        /* skip if none left */
+
+       /*
+        * mode op 0
+        * read next aes block, update aes_ptr_in
+        */
+       ld1             {v0.16b},[x0],16
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+       ld1             {v6.16b},[x8],16        /* key2 */
+       ld1             {v7.16b},[x8],16        /* key3  */
+       eor             v0.16b,v0.16b,v3.16b    /* xor w/ prev value */
+
+       /* aes xform 0, sha quad 0 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su0       v26.4s,v27.4s
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su0       v27.4s,v28.4s
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su0       v28.4s,v29.4s
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+___
+       &aes192_aes256_handle(1, "enc_epilog", 0, 0);
+# Epilog step for AES block 1: store the just-encrypted block 0 and, unless
+# it was the last block (branch to .Lbm2fromQ1), CBC-chain the next input
+# block (XOR with previous ciphertext in v0) and run its aese/aesmc rounds
+# (round keys v8-v16) interleaved with SHA-256 schedule/compression quad 1
+# (round constants key4-key7 loaded from x8, message words w0-w3 in
+# v26-v29, working ABCD/EFGH in q22/q23).
+# NOTE(review): per the inline comments, x13 holds "aes_blocks_left"; x14
+# is the decremented local copy carried into the following steps.
+# The heredoc body is emitted verbatim as assembly -- do not restyle it.
+$code.=<<___;
+       subs            x14,x13,1               /* local copy of aes_blocks_left */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       /* if aes_blocks_left_count == 0 */
+       beq             .Lbm2fromQ1
+       /*
+        * mode op 1
+        * read next aes block, update aes_ptr_in
+        */
+       ld1             {v1.16b},[x0],16
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+
+       eor             v1.16b,v1.16b,v0.16b    /* xor w/prev value */
+
+       /* aes xform 1, sha quad 1 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su0       v26.4s,v27.4s
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su0       v27.4s,v28.4s
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su0       v28.4s,v29.4s
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       sha256su1       v29.4s,v27.4s,v28.4s
+___
+# Emit the extra 192/256-bit-key AES rounds for the block-1 transform above.
+# NOTE(review): &aes192_aes256_handle is defined outside this view; the
+# third argument looks like the AES block index (here 1) -- confirm against
+# the sub's definition earlier in this file.
+       &aes192_aes256_handle(1, "enc_epilog", 1, 0);
+# Epilog step for AES block 2: decrement the remaining-blocks counter, store
+# encrypted block 1 and, unless done (branch to .Lbm2fromQ2), CBC-chain the
+# next input block (XOR with previous ciphertext in v1) and run its
+# aese/aesmc rounds (keys v8-v16) interleaved with SHA-256 quad 2 (round
+# constants from x8, message words v26-v29, working ABCD/EFGH in q22/q23).
+# NOTE(review): the /* wk = key0+w0 */ style comments inside this quad reuse
+# the quad-0 key names while x8 actually advances through later constants;
+# the code is consistent with the other quads -- comments inherited as-is.
+# The heredoc body is emitted verbatim as assembly -- do not restyle it.
+$code.=<<___;
+       subs            x14,x14,1               /* dec counter */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       /* if aes_blocks_left_count == 0 */
+       beq             .Lbm2fromQ2
+       /*
+        * mode op 2
+        * read next aes block, update aes_ptr_in
+        */
+       ld1             {v2.16b},[x0],16
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       eor             v2.16b,v2.16b,v1.16b    /* xor w/prev value */
+
+       /* aes xform 2, sha quad 2 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su0       v26.4s,v27.4s
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su0       v27.4s,v28.4s
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su0       v28.4s,v29.4s
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+___
+# Emit the extra 192/256-bit-key AES rounds for the block-2 transform above.
+# NOTE(review): &aes192_aes256_handle is defined outside this view; the
+# third argument looks like the AES block index (here 2) -- confirm against
+# the sub's definition earlier in this file.
+       &aes192_aes256_handle(1, "enc_epilog", 2, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       /* join common code at Quad 3 */
+       b               .Lbm2fromQ3
+/*
+ * Now there is the b-2 sha block before the final one.  Execution takes over
+ * in the appropriate part of this depending on how many aes blocks were left.
+ * If there were none, the whole thing is executed.
+ */
+/* quad 0 */
+.Lbm2fromQ0:
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+.Lbm2fromQ1:
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+.Lbm2fromQ2:
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+.Lbm2fromQ3:
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       eor             v26.16b,v26.16b,v26.16b /* zero reg */
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       eor             v27.16b,v27.16b,v27.16b /* zero reg */
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       eor             v28.16b,v28.16b,v28.16b /* zero reg */
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       /* Process remaining 0-3 AES blocks here */
+       eor             v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+       cbz             x13,.Lpost_long_Q0
+
+       /* 1st remaining AES block */
+       ld1             {v26.16b},[x3],16
+       sub             x5,x5,16
+       rev32           v26.16b,v26.16b
+       subs            x14,x13,1
+       b.eq            .Lpost_long_Q1
+
+       /* 2nd remaining AES block */
+       ld1             {v27.16b},[x3],16
+       sub             x5,x5,16
+       rev32           v27.16b,v27.16b
+       subs            x14,x14,1
+       b.eq            .Lpost_long_Q2
+
+       /* 3rd remaining AES block */
+       ld1             {v28.16b},[x3],16
+       sub             x5,x5,16
+       rev32           v28.16b,v28.16b
+       /* Allow for filling this sha256 block with the remaining digest src */
+       b               .Lpost_long_Q3
+/*
+ * Process remaining 8B blocks of the digest
+ */
+.Lpost_long_Q0:
+/* blk 0,1 */
+       /* assume final block */
+       mov             v26.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v26 value (0x80) */
+       mov             v26.d[0],x2
+       /* assume this was final block */
+       mov             v26.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v26.d[1],x2
+
+.Lpost_long_Q1:
+/* blk 2,3 */
+       /* assume this is final block */
+       mov             v27.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v27 value (0x80) */
+       mov             v27.d[0],x2
+       /* assume this was final block */
+       mov             v27.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v27.d[1],x2
+
+.Lpost_long_Q2:
+/* blk 4,5 */
+       /* assume this was final block */
+       mov             v28.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v28 value (0x80) */
+       mov             v28.d[0],x2
+       /* assume this was final block */
+       mov             v28.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v28.d[1],x2
+
+.Lpost_long_Q3:
+/* blk 6,7 */
+       /* assume this was final block */
+       mov             v29.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_long_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v29 value (0x80) */
+       mov             v29.d[0],x2
+       /* assume this was final block */
+       mov             v29.b[11],w15
+       /*
+        * Outstanding 8B blocks left.
+        * Since there has to be another sha block with padding,
+        * we need to calculate hash without padding here.
+        */
+       cbz             x5,1f
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       rev32           x2,x2
+       /*
+        * Don't decrease x5 here.
+        * Use it to indicate necessity of constructing "1" padding at the end.
+        */
+       mov             v29.d[1],x2
+
+/*
+ * That is enough of blocks, we allow up to 64 bytes in total.
+ * Now we have the sha256 to do for these 4 16B blocks
+ */
+1:
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       add             v24.4s,v24.4s,v22.4s            /* ABCD += working copy */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       add             v25.4s,v25.4s,v23.4s            /* EFGH += working copy */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       /* this was final block */
+       cbz             x5,.Lpost_long_loop
+       subs            x5,x5,8
+       /* loop if hash is not finished */
+       b.ne            .Lpost_long_Q0
+       /* set "1" of the padding if this was a final block */
+       mov             v26.b[3],w15
+
+.Lpost_long_loop:
+       /* Add outstanding bytes of digest source */
+       add             x11,x11,x12
+       /* Add one SHA-256 block since hash is calculated including i_key_pad */
+       add             x11,x11, #64
+       lsr             x12,x11,32              /* len_hi */
+       and             x13,x11,0xffffffff      /* len_lo */
+       lsl             x12,x12,3               /* len_hi in bits */
+       lsl             x13,x13,3               /* len_lo in bits */
+
+       mov             v29.s[3],w13            /* len_lo */
+       mov             v29.s[2],w12            /* len_hi */
+
+       /*
+        * do last sha of pad block
+        */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+
+       /* quad 0 */
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       ldp             q6,q7,[x8],32           /* key6,key7 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+
+       sha256h         q22, q23, v4.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       add             v26.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v27.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       /* Calculate final HMAC */
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       /* load o_key_pad partial hash */
+       ldp             q24,q25,[x7]
+
+       /* Set padding 1 to the first reg */
+       mov             w11, #0x80              /* that's the 1 of the pad */
+       mov             v28.b[3], w11
+       /* size of o_key_pad + inner hash */
+       mov             x11, #64+32
+       lsl             x11, x11, 3
+       /* move length to the end of the block */
+       mov             v29.s[3], w11
+       ldp             q4,q5,[x8],32           /* key0,key1 */
+       lsr             x11, x11, 32
+       mov             v29.s[2], w11           /* and the higher part */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ldp             q6,q7,[x8],32           /* key6,key7 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+
+       sha256h         q22, q23, v4.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       ldp             d10,d11,[sp,#16]
+       ldp             d12,d13,[sp,#32]
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       ldp             d14,d15,[sp,#48]
+       ldp             d8,d9,[sp],#64
+
+       mov             x0, xzr
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       rev32           v24.16b, v24.16b
+       rev32           v25.16b, v25.16b
+
+       stp             q24,q25,[x4]            /* save them both */
+
+       ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lenc_short_cases:
+       ld1             {v3.16b},[x6]                   /* get ivec */
+       ldp             q8,q9,[x9],32                   /* rk[0-1] */
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       ldp             q10,q11,[x9],32                 /* rk[2-3] */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       ldp             q12,q13,[x9],32                 /* rk[4-5] */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       ldp             q14,q15,[x9],32                 /* rk[6-7] */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+       ldp             q16,q17,[x9],32                 /* rk[8-9] */
+       mov             w15,0x80                        /* sha padding word */
+       lsl             x11,x10,4                       /* len = aes_blocks*16 */
+       ld1             {v18.16b},[x9]                  /* rk[10] */
+
+       /* get outstanding bytes of the digest */
+       sub             x12,x5,x2
+/*
+ * the idea in the short loop (at least 1) is to break out with the padding
+ * already in place excepting the final word.
+ */
+.Lenc_short_loop:
+       adr             x8,.Lrcon                       /* rcon */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       eor             v0.16b,v0.16b,v3.16b            /* xor w/prev value */
+
+       /* aes xform 0 */
+       aese            v0.16b,v8.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v9.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v10.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v11.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v12.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v13.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v14.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v15.16b
+       aesmc           v0.16b,v0.16b
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+___
+       &aes192_aes256_handle(1, "enc_short", 0, 1);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       /* load next 16 bytes for SHA-256 */
+       ld1             {v26.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       rev32           v26.16b,v26.16b /* load res to sha 0, endian swap */
+       sub             x10,x10,1               /* dec num_blocks */
+       cbz             x10,.Lpost_short_Q1     /* break if no more */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x0],16
+       eor             v1.16b,v1.16b,v0.16b    /* xor w/prev value */
+
+       /* aes xform 1 */
+       aese            v1.16b,v8.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v9.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v10.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v11.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v12.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v13.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v14.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v15.16b
+       aesmc           v1.16b,v1.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+___
+       &aes192_aes256_handle(1, "enc_short", 1, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       /* load next 16 bytes for SHA-256 */
+       ld1             {v27.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       rev32           v27.16b,v27.16b /* load res to sha 1, endian swap */
+       sub             x10,x10,1               /* dec num_blocks */
+       cbz             x10,.Lpost_short_Q2     /* break if no more */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x0],16
+       eor             v2.16b,v2.16b,v1.16b    /* xor w/prev value */
+
+       /* aes xform 2 */
+       aese            v2.16b,v8.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v9.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v10.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v11.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v12.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v13.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v14.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v15.16b
+       aesmc           v2.16b,v2.16b
+       aese            v2.16b,v16.16b
+       aesmc           v2.16b,v2.16b
+___
+       &aes192_aes256_handle(1, "enc_short", 2, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       /* load next 16 bytes for SHA-256 */
+       ld1             {v28.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       rev32           v28.16b,v28.16b         /* load res to sha 2, endian swap */
+       sub             x10,x10,1               /* dec num_blocks */
+       cbz             x10,.Lpost_short_Q3     /* break if no more */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x0],16
+       eor             v3.16b,v3.16b,v2.16b    /* xor w/ prev value */
+
+       /* aes xform 3 */
+       aese            v3.16b,v8.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v9.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v10.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v11.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v12.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v13.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v14.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v15.16b
+       aesmc           v3.16b,v3.16b
+       aese            v3.16b,v16.16b
+       aesmc           v3.16b,v3.16b
+___
+       &aes192_aes256_handle(1, "enc_short", 3, 0);
+$code.=<<___;
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       /* load next 16 bytes for SHA-256 */
+       ld1             {v29.16b},[x3],16
+       /* dec number of bytes of the hash input */
+       sub             x5,x5,16
+       /* load res to sha 3, endian swap */
+       rev32           v29.16b,v29.16b
+       /*
+        * now we have the sha256 to do for these 4 aes blocks
+        */
+
+       /* quad 0 */
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       add             v24.4s,v24.4s,v22.4s            /* ABCD += working copy */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       add             v25.4s,v25.4s,v23.4s            /* EFGH += working copy */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       sub             x10,x10,1                       /* dec num_blocks */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       cbnz            x10,.Lenc_short_loop            /* keep looping if more */
+
+.Lpost_short_Q0:
+       /* assume this was final block */
+       mov             v26.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v26 value (0x80) */
+       mov             v26.d[0],x2
+       /* assume this was final block */
+       mov             v26.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v26.d[1],x2
+.Lpost_short_Q1:
+       /* zero out vectors */
+       eor             v27.16b,v27.16b,v27.16b
+       eor             v28.16b,v28.16b,v28.16b
+       eor             v29.16b,v29.16b,v29.16b
+       /* assume this is final block */
+       mov             v27.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v27 value (0x80) */
+       mov             v27.d[0],x2
+       /* assume this was final block */
+       mov             v27.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v27.d[1],x2
+.Lpost_short_Q2:
+       /* zero out vectors (repeated if came from Q0) */
+       eor             v28.16b,v28.16b,v28.16b
+       eor             v29.16b,v29.16b,v29.16b
+       /* assume this was final block */
+       mov             v28.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v28 value (0x80) */
+       mov             v28.d[0],x2
+       /* assume this was final block */
+       mov             v28.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       mov             v28.d[1],x2
+.Lpost_short_Q3:
+       /* zero out vector (repeated if came from Q1) */
+       eor             v29.16b,v29.16b,v29.16b
+       /* assume this was final block */
+       mov             v29.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_short_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v29 value (0x80) */
+       mov             v29.d[0],x2
+       /* assume this was final block */
+       mov             v29.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,1f
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       rev32           x2,x2
+       mov             v29.d[1],x2
+
+/*
+ * That is enough of blocks, we allow up to 64 bytes in total.
+ * Now we have the sha256 to do for these 4 16B blocks
+ */
+1:
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       eor             v26.16b,v26.16b,v26.16b         /* zero sha src 0 */
+       add             v24.4s,v24.4s,v22.4s            /* ABCD += working copy */
+       eor             v27.16b,v27.16b,v27.16b         /* zero sha src 1 */
+       add             v25.4s,v25.4s,v23.4s            /* EFGH += working copy */
+       eor             v28.16b,v28.16b,v28.16b         /* zero sha src 2 */
+       eor             v29.16b,v29.16b,v29.16b         /* zero sha src 3 */
+
+       /* this was final block */
+       cbz             x5,.Lpost_short_loop
+       subs            x5,x5,8
+       /* loop if hash is not finished */
+       b.ne            .Lpost_short_Q0
+       /* set "1" of the padding if this was a final block */
+       mov             v26.b[3],w15
+
+/*
+ * there are between 0 and 3 aes blocks in the final sha256 blocks
+ */
+.Lpost_short_loop:
+       /* Add outstanding bytes of digest source */
+       add             x11,x11,x12
+       /* Add one SHA-256 block since hash is calculated including i_key_pad */
+       add             x11,x11, #64
+       lsr             x12,x11,32              /* len_hi */
+       and             x13,x11,0xffffffff      /* len_lo */
+       lsl             x12,x12,3               /* len_hi in bits */
+       lsl             x13,x13,3               /* len_lo in bits */
+
+       mov             v29.s[3],w13            /* len_lo */
+       mov             v29.s[2],w12            /* len_hi */
+
+       /* do final block */
+
+       /* base address for sha round consts */
+       adr             x8,.Lrcon               /* top of rcon */
+
+       /* quad 0 */
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       ldp             q6,q7,[x8],32           /* key6,key7 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+
+       sha256h         q22, q23, v4.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       add             v26.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v27.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       /* Calculate final HMAC */
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       /* load o_key_pad partial hash */
+       ldp             q24,q25,[x7]
+
+       /* Set padding 1 to the first reg */
+       mov             w11, #0x80              /* that's the 1 of the pad */
+       mov             v28.b[3], w11
+       /* size of o_key_pad + inner hash */
+       mov             x11, #64+32
+       lsl             x11, x11, 3
+       /* move length to the end of the block */
+       mov             v29.s[3], w11
+       ldp             q4,q5,[x8],32           /* key0,key1 */
+       lsr             x11, x11, 32
+       mov             v29.s[2], w11           /* and the higher part */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ldp             q6,q7,[x8],32           /* key6,key7 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+
+       sha256h         q22, q23, v4.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       ldp             d10,d11,[sp,#16]
+       ldp             d12,d13,[sp,#32]
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       ldp             d14,d15,[sp,#48]
+       ldp             d8,d9,[sp],#64
+
+       mov             x0, xzr
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       rev32           v24.16b, v24.16b
+       rev32           v25.16b, v25.16b
+
+       stp             q24,q25,[x4]            /* save them both */
+
+       ret
+
+.size  asm_aescbc_sha256_hmac, .-asm_aescbc_sha256_hmac
+
+# Description:
+#
+# Combined Auth/Dec Primitive = sha256_hmac/aes128cbc
+#
+# Operations:
+#
+# out = decrypt-AES128CBC(in)
+# return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | in))
+#
+# Prototype:
+#
+# void asm_sha256_hmac_aescbc_dec(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+#                      uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+#                      CIPH_DIGEST *arg)
+#
+# Registers used:
+#
+# asm_sha256_hmac_aescbc_dec(
+#      csrc,   x0      (cipher src address)
+#      cdst,   x1      (cipher dst address)
+#      clen    x2      (cipher length)
+#      dsrc,   x3      (digest src address)
+#      ddst,   x4      (digest dst address)
+#      dlen,   x5      (digest length)
+#      arg     x6:
+#              arg->cipher.key                 (round keys)
+#              arg->cipher.key_rounds          (key rounds)
+#              arg->cipher.iv                  (initialization vector)
+#              arg->digest.hmac.i_key_pad      (partially hashed i_key_pad)
+#              arg->digest.hmac.o_key_pad      (partially hashed o_key_pad)
+#      )
+#
+# Routine register definitions:
+#
+# v0 - v3 -- aes results
+# v4 - v7 -- round consts for sha
+# v8 - v18 -- round keys
+# v19 - v20 -- round keys
+# v21 -- ABCD tmp
+# v22 -- sha working state ABCD (q22)
+# v23 -- sha working state EFGH (q23)
+# v24 -- sha state ABCD
+# v25 -- sha state EFGH
+# v26 -- sha block 0
+# v27 -- sha block 1
+# v28 -- sha block 2
+# v29 -- sha block 3
+# v30 -- reserved
+# v31 -- reserved
+#
+#
+# Constraints:
+#
+# The variable "clen" must be a multiple of 16, otherwise results are not
+# defined. For AES partial blocks the user is required to pad the input so
+# that its length modulo 16 is 0.
+#
+# The variable "dlen" must be a multiple of 8 and greater than or equal to
+# "clen". The maximum difference between "dlen" and "clen" cannot exceed
+# 64 bytes. This constraint is strictly related to the needs of the IPSec
+# ESP packet.
+# Short lengths are less optimized at < 16 AES blocks, however they are
+# somewhat optimized, and more so than the enc/auth versions.
+
+.global        asm_sha256_hmac_aescbc_dec
+.type  asm_sha256_hmac_aescbc_dec,%function
+
+asm_sha256_hmac_aescbc_dec:
+       AARCH64_VALID_CALL_TARGET
+       /* protect registers */
+       stp             d8,d9,[sp, #-80]!
+
+       /* fetch args */
+       ldr             x7, [x6, #HMAC_IKEYPAD]
+       /* init ABCD, EFGH */
+       ldp             q24,q25,[x7]
+       /* save pointer to o_key_pad partial hash */
+       ldr             x7, [x6, #HMAC_OKEYPAD]
+
+       stp             d10,d11,[sp,#16]
+
+       prfm            PLDL1KEEP,[x0,0]        /* pref next aes_ptr_in */
+       stp             d12,d13,[sp,#32]
+       prfm            PLDL1KEEP,[x1,0]        /* pref next aes_ptr_out */
+       lsr             x10,x2,4                /* aes_blocks = len/16 */
+       stp             d14,d15,[sp,#48]
+       /* address of sha init state consts */
+       adr             x12,.Linit_sha_state
+       stp             x19,x20,[sp,#64]
+
+       ldr             x9, [x6, #CIPHER_KEY]
+       ldr             x16, [x6, #CIPHER_KEY_ROUNDS]
+       ldr             x6, [x6, #CIPHER_IV]
+       add             x17, x9, #160           /* point to the last 5 rounds keys */
+       /*
+        * Init sha state, prefetch, check for small cases.
+        * Note that the output is prefetched as a load, for the in-place case.
+        */
+       cmp             x10,16                  /* no main loop if <16 */
+       blt             .Ldec_short_cases       /* branch if < 12 */
+
+       /* get outstanding bytes of the digest */
+       sub             x20,x5,x2
+
+       mov             x11,x2                  /* len -> x11 needed at end */
+       ld1             {v30.16b},[x6]          /* get 1st ivec */
+       lsr             x12,x11,6               /* total_blocks (sha) */
+
+       ldp             q26,q27,[x3],32
+       rev32           v26.16b,v26.16b         /* endian swap w0 */
+       rev32           v27.16b,v27.16b         /* endian swap w1 */
+       ldp             q28,q29,[x3],32
+       rev32           v28.16b,v28.16b         /* endian swap w2 */
+       rev32           v29.16b,v29.16b         /* endian swap w3 */
+
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+       /*
+        * now we can do the loop prolog, 1st sha256 block
+        */
+       prfm            PLDL1KEEP,[x0,64]       /* pref next aes_ptr_in */
+       prfm            PLDL1KEEP,[x1,64]       /* pref next aes_ptr_out */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       /*
+        * do the first sha256 block on the plaintext
+        */
+       mov             v22.16b,v24.16b         /* init working ABCD */
+       mov             v23.16b,v25.16b         /* init working EFGH */
+
+       /* quad 0 */
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+       ld1             {v6.16b},[x8],16        /* key2 */
+       ld1             {v7.16b},[x8],16        /* key3 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v8.16b},[x9],16        /* rk[0] */
+       sha256h2        q23, q21, v4.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+       ld1             {v9.16b},[x9],16        /* rk[1] */
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v10.16b},[x9],16       /* rk[2] */
+       sha256h2        q23, q21, v5.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256h2        q23, q21, v6.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+       ld1             {v11.16b},[x9],16       /* rk[3] */
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h2        q23, q21, v7.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v12.16b},[x9],16       /* rk[4] */
+       sha256h2        q23, q21, v4.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+       ld1             {v13.16b},[x9],16       /* rk[5] */
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v14.16b},[x9],16       /* rk[6] */
+       sha256h2        q23, q21, v5.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256h2        q23, q21, v6.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+       ld1             {v15.16b},[x9],16       /* rk[7] */
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h2        q23, q21, v7.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v16.16b},[x9],16       /* rk[8] */
+       sha256h2        q23, q21, v4.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+       ld1             {v17.16b},[x9],16       /* rk[9] */
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v18.16b},[x9],16       /* rk[10] */
+       sha256h2        q23, q21, v5.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256h2        q23, q21, v6.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256h2        q23, q21, v7.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256h2        q23, q21, v4.4s
+       ld1             {v26.16b},[x3],16       /* next w0 */
+       ld1             {v27.16b},[x3],16       /* next w1 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h2        q23, q21, v5.4s
+       ld1             {v28.16b},[x3],16       /* next w2 */
+       ld1             {v29.16b},[x3],16       /* next w3 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+
+       /*
+        * aes_blocks_left := number after the main (sha) block is done.
+        * can be 0 note we account for the extra unwind in main_blocks
+        */
+       sub             x15,x12,2               /* main_blocks=total_blocks-5 */
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       and             x13,x10,3               /* aes_blocks_left */
+       ld1             {v0.16b},[x0]           /* next aes block, no update */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+       add             x9,x0,128               /* lead_ptr = *in */
+       /* next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       /* indicate AES blocks to write back */
+       mov             x19,xzr
+/*
+ * main combined loop CBC, can be used by auth/enc version
+ */
+.Ldec_main_loop:
+       /*
+        * Because both mov, rev32 and eor have a busy cycle, this takes longer
+        * than it looks.
+        */
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       prfm            PLDL1KEEP,[x9,64]       /* pref next lead_ptr */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       /* pref next aes_ptr_out, streaming */
+       prfm            PLDL1KEEP,[x1,64]
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+
+       /*
+        * aes xform 0, sha quad 0
+        */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v4.16b},[x8],16        /* key0 */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+       sha256su0       v26.4s,v27.4s
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v6.16b},[x8],16        /* key2 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v7.16b},[x8],16        /* key3 */
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+       /* read next aes block, no update */
+       ld1             {v1.16b},[x0]
+       sha256h2        q23, q21, v4.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su0       v27.4s,v28.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha256h         q22, q23, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256h2        q23, q21, v5.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+       sha256su0       v28.4s,v29.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256su0       v29.4s,v26.4s
+       sha256h         q22, q23, v7.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",0,0);
+$code.=<<___;
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ prev value */
+       /* get next aes block, with update */
+       ld1             {v30.16b},[x0],16
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+
+       /* aes xform 1, sha quad 1 */
+       sha256su0       v26.4s,v27.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       sha256h         q22, q23, v4.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h2        q23, q21, v4.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       /* read next aes block, no update */
+       ld1             {v2.16b},[x0]
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       ld1             {v5.16b},[x8],16        /* key5 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256su0       v28.4s,v29.4s
+       sha256h         q22, q23, v6.4s
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       sha256h2        q23, q21, v6.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",1,0);
+$code.=<<___;
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       eor             v1.16b,v1.16b,v31.16b   /* mode op 1 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       /* aes xform 2, sha quad 2 */
+       sha256su0       v26.4s,v27.4s
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su0       v27.4s,v28.4s
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256su0       v28.4s,v29.4s
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su0       v29.4s,v26.4s
+       /* read next aes block, no update */
+       ld1             {v3.16b},[x0]
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v29.4s,v27.4s,v28.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",2,0);
+$code.=<<___;
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       eor             v2.16b,v2.16b,v30.16b   /* mode of 2 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+
+       /* aes xform 3, sha quad 3 (hash only) */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       ld1             {v26.16b},[x3],16       /* next w0 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       ld1             {v27.16b},[x3],16       /* next w1 */
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       ld1             {v28.16b},[x3],16       /* next w2 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       ld1             {v29.16b},[x3],16       /* next w3 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       sub             x15,x15,1               /* dec block count */
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       ld1             {v0.16b},[x0]           /* next aes block, no update */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_mainloop",3,0);
+$code.=<<___;
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/ prev value */
+       /* next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       /* subtract loaded bytes */
+       sub             x5,x5,64
+       cbnz            x15,.Ldec_main_loop     /* loop if more to do */
+       /*
+        * Now the loop epilog. Since the reads for sha have already been done
+        * in advance, we have to have an extra unwind.
+        * This is why the test for the short cases is 16 and not 12.
+        *
+        * The unwind, which is just the main loop without the tests or final reads.
+        */
+
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       /* pref next aes_ptr_out, streaming */
+       prfm            PLDL1KEEP,[x1,64]
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       /*
+        * aes xform 0, sha quad 0
+        */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v6.16b},[x8],16        /* key2 */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       ld1             {v7.16b},[x8],16        /* key3 */
+       /* read next aes block, no update */
+       ld1             {v1.16b},[x0]
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su0       v26.4s,v27.4s
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+       sha256h2        q23, q21, v4.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su0       v27.4s,v28.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha256h         q22, q23, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256h2        q23, q21, v5.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v5.16b},[x8],16        /* key5 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+       sha256su0       v28.4s,v29.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256su0       v29.4s,v26.4s
+       sha256h         q22, q23, v7.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",0,0);
+$code.=<<___;
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+
+       /* aes xform 1, sha quad 1 */
+       sha256su0       v26.4s,v27.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v0.16b},[x1],16
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       sha256h         q22, q23, v4.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h2        q23, q21, v4.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       /* read next aes block, no update */
+       ld1             {v2.16b},[x0]
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       ld1             {v4.16b},[x8],16        /* key4 */
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       ld1             {v5.16b},[x8],16        /* key5 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256su0       v28.4s,v29.4s
+       sha256h         q22, q23, v6.4s
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       sha256h2        q23, q21, v6.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",1,0);
+$code.=<<___;
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       eor             v1.16b,v1.16b,v31.16b   /* mode op 1 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       /* mode op 2 */
+
+       /* aes xform 2, sha quad 2 */
+       sha256su0       v26.4s,v27.4s
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v1.16b},[x1],16
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256su0       v27.4s,v28.4s
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256su0       v28.4s,v29.4s
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su0       v29.4s,v26.4s
+       /* read next aes block, no update */
+       ld1             {v3.16b},[x0]
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       sha256su1       v29.4s,v27.4s,v28.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",2,0);
+$code.=<<___;
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       eor             v2.16b,v2.16b,v30.16b   /* mode op 2 xor w/prev value */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+
+       /* mode op 3 */
+
+       /* aes xform 3, sha quad 3 (hash only) */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v2.16b},[x1],16
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       /* read first aes block, no bump */
+       ld1             {v0.16b},[x0]
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_epilog",3,0);
+$code.=<<___;
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/prev value */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+
+       /*
+        * now we have to do the 4 aes blocks (b-2) that catch up to where sha is
+        */
+
+       /* aes xform 0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       /* read next aes block, no update */
+       ld1             {v1.16b},[x0]
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",0,0);
+$code.=<<___;
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/ ivec (modeop) */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+
+       /* aes xform 1 */
+       /* read next aes block, no update */
+       ld1             {v2.16b},[x0]
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",1,0);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v31.16b   /* xor w/ ivec (modeop) */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v31.16b},[x0],16
+
+       /* aes xform 2 */
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       /* read next aes block, no update */
+       ld1             {v3.16b},[x0]
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",2,0);
+$code.=<<___;
+       eor             v2.16b,v2.16b,v30.16b   /* xor w/ ivec (modeop) */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v30.16b},[x0],16
+
+       /* aes xform 3 */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       eor             v26.16b,v26.16b,v26.16b
+       eor             v27.16b,v27.16b,v27.16b
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+       eor             v28.16b,v28.16b,v28.16b
+       eor             v29.16b,v29.16b,v29.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_catchup",3,0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/ ivec (modeop) */
+
+       add             x19,x19,4
+/*
+ * Now, there is the final b-1 sha256 padded block.
+ * This contains between 0-3 aes blocks. We take some pains to avoid read spill
+ * by only reading the blocks that are actually defined.
+ * This is also the final sha block code for the short cases.
+ */
+.Ljoin_common:
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       mov             w15,0x80        /* that's the 1 of the pad */
+.Lpost_loop_Q0:
+       /* assume this was final block */
+       mov             v26.b[0],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       /* overwrite previous v26 value (0x80) */
+       mov             v26.d[0],x2
+       /* assume this was final block */
+       mov             v26.b[8],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       mov             v26.d[1],x2
+.Lpost_loop_Q1:
+       /* assume this is final block */
+       mov             v27.b[0],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       /* overwrite previous v27 value (0x80) */
+       mov             v27.d[0],x2
+       /* assume this was final block */
+       mov             v27.b[8],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       mov             v27.d[1],x2
+.Lpost_loop_Q2:
+       /* assume this was final block */
+       mov             v28.b[0],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       /* overwrite previous v28 value (0x80) */
+       mov             v28.d[0],x2
+       /* assume this was final block */
+       mov             v28.b[8],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       mov             v28.d[1],x2
+.Lpost_loop_Q3:
+       /* assume this was final block */
+       mov             v29.b[3],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,.Lpost_loop
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       sub             x5,x5,8
+       rev32           x2,x2
+       /* overwrite previous v29 value (0x80) */
+       mov             v29.d[0],x2
+       /* assume this was final block */
+       mov             v29.b[11],w15
+       /* outstanding 8B blocks left */
+       cbz             x5,1f
+       /* at least 8B left to go, it is safe to fetch this data */
+       ldr             x2,[x3],8
+       rev32           x2,x2
+       mov             v29.d[1],x2
+
+/*
+ * That is enough blocks; we allow up to 64 bytes in total.
+ * Now we have the sha256 to do for these 4 16B blocks.
+ */
+1:
+       mov             x9,x8
+       rev32           v26.16b,v26.16b
+       ld1             {v4.16b},[x9],16        /* key0 */
+       rev32           v27.16b,v27.16b
+       rev32           v28.16b,v28.16b
+       ld1             {v5.16b},[x9],16        /* key1 */
+       //rev32         v29.16b,v29.16b
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key2 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key3 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       eor             v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       eor             v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       eor             v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       eor             v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       /* this was final block */
+       cbz             x5,.Lpost_loop
+       subs            x5,x5,8
+       /* loop if hash is not finished */
+       b.ne            .Lpost_loop_Q0
+       /* set "1" of the padding if this was a final block */
+       mov             v26.b[0],w15
+
+.Lpost_loop:
+       /* Add outstanding bytes of digest source */
+       add             x11,x11,x20
+       /* Add one SHA-2 block since hash is calculated including i_key_pad */
+       add             x11,x11,#64
+       lsr             x12,x11,32              /* len_hi */
+       and             x14,x11,0xffffffff      /* len_lo */
+       lsl             x12,x12,3               /* len_hi in bits */
+       lsl             x14,x14,3               /* len_lo in bits */
+
+       mov             v29.s[3],w14            /* len_lo */
+       mov             v29.s[2],w12            /* len_hi */
+
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+
+       /* skip write back if there were less than 4 AES blocks */
+       cbz             x19,1f
+       /*
+        * At this point all data should be fetched for SHA.
+        * Save remaining blocks without danger of overwriting SHA source.
+        */
+       stp             q0,q1,[x1],32
+       stp             q2,q3,[x1],32
+1:
+       /*
+        * final sha block
+        * the strategy is to combine the 0-3 aes blocks, which is faster but
+        * a little gourmand on code space.
+        */
+       cbz             x13,.Lzero_aes_blocks_left      /* none to do */
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0]
+       ld1             {v31.16b},[x0],16
+       adr             x8,.Lrcon
+       ld1             {v4.16b},[x8],16        /* key0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v5.16b},[x8],16        /* key1 */
+       ld1             {v6.16b},[x8],16        /* key2 */
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       ld1             {v7.16b},[x8],16        /* key3 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha256h2        q23, q21, v4.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+       sha256su0       v27.4s,v28.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha256h2        q23, q21, v6.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_final1",0,0);
+$code.=<<___;
+       sha256su1       v29.4s,v27.4s,v28.4s
+       eor             v3.16b,v0.16b,v30.16b   /* xor w/ ivec (modeop) */
+
+       sub             x13,x13,1               /* dec counter */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       cbz             x13,.Lfrmquad1
+
+       /* aes xform 1 */
+
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0]
+       ld1             {v30.16b},[x0],16
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v26.4s,v28.4s,v29.4s
+       sha256su0       v27.4s,v28.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v27.4s,v29.4s,v26.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su0       v28.4s,v29.4s
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       sha256su0       v29.4s,v26.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_final2",0,0);
+$code.=<<___;
+       sha256su1       v29.4s,v27.4s,v28.4s
+       eor             v3.16b,v0.16b,v31.16b   /* xor w/ ivec (modeop) */
+
+       sub             x13,x13,1               /* dec counter */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       cbz             x13,.Lfrmquad2
+
+       /* aes xform 2 */
+
+       /* read first aes block, bump aes_ptr_in */
+       ld1             {v0.16b},[x0],16
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+       ld1             {v6.16b},[x8],16        /* key6 */
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       sha256h2        q23, q21, v4.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       sha256h2        q23, q21, v5.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       sha256su1       v28.4s,v26.4s,v27.4s
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+___
+       &aes192_aes256_dec_handle(1,"dec_final3",0,0);
+$code.=<<___;
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+       eor             v3.16b,v0.16b,v30.16b   /* xor w/ ivec (modeop) */
+       /* save aes res, bump aes_out_ptr */
+       st1             {v3.16b},[x1],16
+       b               .Lfrmquad3
+/*
+ * the final block with no aes component, i.e. from here on there were zero blocks
+ */
+
+.Lzero_aes_blocks_left:
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+.Lfrmquad1:
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+.Lfrmquad2:
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+.Lfrmquad3:
+       ld1             {v4.16b},[x8],16        /* key4 */
+       ld1             {v5.16b},[x8],16        /* key5 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       eor             v26.16b,v26.16b,v26.16b /* zero reg */
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       eor             v27.16b,v27.16b,v27.16b /* zero reg */
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       eor             v28.16b,v28.16b,v28.16b /* zero reg */
+       sha256h2        q23, q21, v7.4s
+
+       add             v26.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       eor             v29.16b,v29.16b,v29.16b /* zero reg */
+       add             v27.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       /*
+        * Calculate final HMAC
+        */
+       /* base address for sha round consts */
+       adr             x8,.Lrcon
+       /* load o_key_pad partial hash */
+       ld1             {v24.16b},[x7],16
+       ld1             {v25.16b},[x7]
+
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+
+       /* Set padding 1 to the first reg */
+       mov             w11, #0x80              /* that's the 1 of the pad */
+       mov             v28.b[3], w11
+       /* size of o_key_pad + inner hash */
+       mov             x11, #64+32
+       lsl             x11, x11, 3
+       /* move length to the end of the block */
+       mov             v29.s[3], w11
+       lsr             x11, x11, 32
+       mov             v29.s[2], w11           /* and the higher part */
+
+       ld1             {v4.16b},[x8],16        /* key0 */
+       ld1             {v5.16b},[x8],16        /* key1 */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key2 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key3 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x8],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x8],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x8],16        /* key8 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x8],16        /* key9 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key8+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key10 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key9+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key11 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key10+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x8],16        /* key12 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key11+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x8],16        /* key13 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key12+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x8],16        /* key14 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key13+w1 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x8],16        /* key15 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key14+w2 */
+
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       sha256h         q22, q23, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key15+w3 */
+       ldp             d10,d11,[sp,#16]
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       sha256h         q22, q23, v7.4s
+       ldp             d12,d13,[sp,#32]
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       ldp             d14,d15,[sp,#48]
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       rev32           v24.16b, v24.16b
+       ldp             x19,x20,[sp,#64]
+       ldp             d8,d9,[sp],#80
+       rev32           v25.16b, v25.16b
+       st1             {v24.4s},[x4],16
+       mov             x0, xzr
+       st1             {v25.4s},[x4]
+
+       ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Ldec_short_cases:
+       ldp             q8,q9,[x9],32
+       adr             x8,.Lrcon               /* rcon */
+       ldp             q10,q11,[x9],32
+       lsl             x11,x10,4               /* len=aes_blocks*16 */
+
+       ldp             q12,q13,[x9],32
+       ldp             q14,q15,[x9],32
+       ld1             {v30.16b},[x6]          /* get ivec */
+       ldp             q16,q17,[x9],32
+       ld1             {v18.16b},[x9]
+
+       /* get outstanding bytes of the digest */
+       sub             x20,x5,x2
+
+       /* indicate AES blocks to write back */
+       mov             x19,xzr
+
+       mov             x2,x0
+
+       /*
+        * Digest source has to be at least of cipher source length
+        * therefore it is safe to use x10 to indicate whether we can
+        * overtake cipher processing by 4 AES block here.
+        */
+       cmp             x10,4                   /* check if 4 or more */
+       /* if less, bail to last block */
+       blt             .Llast_sha_block
+
+       sub             x5,x5,64
+
+       mov             x9,x8                   /* top of rcon */
+
+       /* quad 0 */
+       ld1             {v26.16b},[x3],16
+       ld1             {v4.16b},[x9],16        /* key0 */
+       ld1             {v27.16b},[x3],16
+       rev32           v26.16b,v26.16b
+       ld1             {v28.16b},[x3],16
+       rev32           v27.16b,v27.16b
+       ld1             {v29.16b},[x3],16
+       rev32           v28.16b,v28.16b
+       ld1             {v5.16b},[x9],16        /* key1 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       rev32           v29.16b,v29.16b
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key2 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key3 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       /* there were at least 4 AES blocks to process */
+       b               .Lshort_loop_no_store
+
+.Ldec_short_loop:
+       cmp             x10,4                   /* check if 4 or more */
+       /* if less, bail to last block */
+       blt             .Llast_sha_block
+       stp             q0,q1,[x1],32
+       stp             q2,q3,[x1],32
+
+       sub             x19,x19,4
+
+.Lshort_loop_no_store:
+       ld1             {v31.16b},[x2]          /* next w no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v0.16b},[x2],16
+
+       add             x0,x0,64
+
+       /* aes xform 0 */
+       aesd            v0.16b,v8.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v9.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v10.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v11.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v12.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v13.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v14.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v15.16b
+       aesimc          v0.16b,v0.16b
+       aesd            v0.16b,v16.16b
+       aesimc          v0.16b,v0.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",0,0);
+$code.=<<___;
+       eor             v0.16b,v0.16b,v30.16b   /* xor w/prev value */
+
+       ld1             {v30.16b},[x2]          /* read no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v1.16b},[x2],16
+
+       /* aes xform 1 */
+       aesd            v1.16b,v8.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v9.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v10.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v11.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v12.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v13.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v14.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v15.16b
+       aesimc          v1.16b,v1.16b
+       aesd            v1.16b,v16.16b
+       aesimc          v1.16b,v1.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",1,0);
+$code.=<<___;
+       eor             v1.16b,v1.16b,v31.16b   /* xor w/prev value */
+
+       ld1             {v31.16b},[x2]          /* read no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v2.16b},[x2],16
+
+       /* aes xform 2 */
+       aesd            v2.16b,v8.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v9.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v10.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v11.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v12.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v13.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v14.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v15.16b
+       aesimc          v2.16b,v2.16b
+       aesd            v2.16b,v16.16b
+       aesimc          v2.16b,v2.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",2,0);
+$code.=<<___;
+       eor             v2.16b,v2.16b,v30.16b           /* xor w/prev value */
+
+       ld1             {v30.16b},[x2]                  /* read no update */
+       /* read next aes block, update aes_ptr_in */
+       ld1             {v3.16b},[x2],16
+
+       /* aes xform 3 */
+       aesd            v3.16b,v8.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v9.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v10.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v11.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v12.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v13.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v14.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v15.16b
+       aesimc          v3.16b,v3.16b
+       aesd            v3.16b,v16.16b
+       aesimc          v3.16b,v3.16b
+___
+       &aes192_aes256_dec_handle(1,"dec_short",3,0);
+$code.=<<___;
+       eor             v3.16b,v3.16b,v31.16b   /* xor w/prev value */
+
+       add             x19,x19,4
+
+       sub             x10,x10,4               /* 4 less */
+       cmp             x5,64
+       b.lt            .Ldec_short_loop        /* keep looping */
+
+       sub             x5,x5,64
+
+       mov             x9,x8                   /* top of rcon */
+
+       /* quad 0 */
+       ld1             {v26.16b},[x3],16
+       ld1             {v4.16b},[x9],16        /* key0 */
+       ld1             {v27.16b},[x3],16
+       rev32           v26.16b,v26.16b
+       ld1             {v28.16b},[x3],16
+       rev32           v27.16b,v27.16b
+       ld1             {v29.16b},[x3],16
+       rev32           v28.16b,v28.16b
+       ld1             {v5.16b},[x9],16        /* key1 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       rev32           v29.16b,v29.16b
+
+       sha256su0       v26.4s,v27.4s
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key2 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key3 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 1 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 2 */
+       sha256su0       v26.4s,v27.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       sha256h2        q23, q21, v4.4s
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       sha256su0       v27.4s,v28.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       sha256h2        q23, q21, v5.4s
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       sha256su0       v28.4s,v29.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       ld1             {v4.16b},[x9],16        /* key4 */
+       sha256h2        q23, q21, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       sha256su0       v29.4s,v26.4s
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       ld1             {v5.16b},[x9],16        /* key5 */
+       sha256h2        q23, q21, v7.4s
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       /* quad 3 */
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v4.4s
+       ld1             {v6.16b},[x9],16        /* key6 */
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v5.4s
+       ld1             {v7.16b},[x9],16        /* key7 */
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v6.4s
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       b               .Ldec_short_loop        /* keep looping */
+/*
+ * This is arranged so that we can join the common unwind code that does
+ * the last sha block and the final 0-3 aes blocks.
+ */
+.Llast_sha_block:
+       eor             v26.16b,v26.16b,v26.16b         /* zero the rest */
+       eor             v27.16b,v27.16b,v27.16b         /* zero the rest */
+       eor             v28.16b,v28.16b,v28.16b         /* zero the rest */
+       eor             v29.16b,v29.16b,v29.16b         /* zero the rest */
+
+       mov             x13,x10                 /* copy aes blocks for common */
+       b               .Ljoin_common           /* join common code */
+
+.size  asm_sha256_hmac_aescbc_dec, .-asm_sha256_hmac_aescbc_dec
+___
+
+# Emit the generated assembly: split $code into lines, expand any
+# backtick-quoted `expr` placeholders via eval (standard perlasm
+# substitution), and print each line.  Output is produced only for the
+# 64-bit flavour, as seen in the guard below.
+if ($flavour =~ /64/) {
+       foreach(split("\n",$code)) {
+       s/\`([^\`]*)\`/eval($1)/geo;
+       print $_,"\n";
+       }
+}
+
+# Close STDOUT explicitly so buffered write errors are detected and fatal.
+close STDOUT or die "error closing STDOUT: $!";
\ No newline at end of file
index 11d27d0451c739b997b6cf9e9ba3f76c464f862e..661b34592f295aecff9b6524ced83e050ac1578e 100644 (file)
@@ -31,7 +31,9 @@ IF[{- !$disabled{asm} -}]
 
   $AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S
   $AESDEF_armv4=AES_ASM BSAES_ASM
-  $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S
+  $AESASM_aarch64=\
+        aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S \
+        aes-sha1-armv8.S aes-sha256-armv8.S
   $AESDEF_aarch64=BSAES_ASM VPAES_ASM
 
   $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s
@@ -137,6 +139,10 @@ GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl
 INCLUDE[aesv8-armx.o]=..
 GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl
 INCLUDE[vpaes-armv8.o]=..
+GENERATE[aes-sha1-armv8.S]=asm/aes-sha1-armv8.pl
+INCLUDE[aes-sha1-armv8.o]=..
+GENERATE[aes-sha256-armv8.S]=asm/aes-sha256-armv8.pl
+INCLUDE[aes-sha256-armv8.o]=..
 
 GENERATE[aes-armv4.S]=asm/aes-armv4.pl
 INCLUDE[aes-armv4.o]=..
index 32ada929e1bec1f4e7dcdd283a4d7c80ebd93cd3..6c44b845c99637d276814ce3a4dcefd39cf8d0e0 100644 (file)
@@ -320,11 +320,12 @@ int EVP_CIPHER_get_type(const EVP_CIPHER *cipher)
 int evp_cipher_cache_constants(EVP_CIPHER *cipher)
 {
     int ok, aead = 0, custom_iv = 0, cts = 0, multiblock = 0, randkey = 0;
+    int encrypt_then_mac = 0;
     size_t ivlen = 0;
     size_t blksz = 0;
     size_t keylen = 0;
     unsigned int mode = 0;
-    OSSL_PARAM params[10];
+    OSSL_PARAM params[11];
 
     params[0] = OSSL_PARAM_construct_size_t(OSSL_CIPHER_PARAM_BLOCK_SIZE, &blksz);
     params[1] = OSSL_PARAM_construct_size_t(OSSL_CIPHER_PARAM_IVLEN, &ivlen);
@@ -338,7 +339,9 @@ int evp_cipher_cache_constants(EVP_CIPHER *cipher)
                                          &multiblock);
     params[8] = OSSL_PARAM_construct_int(OSSL_CIPHER_PARAM_HAS_RAND_KEY,
                                          &randkey);
-    params[9] = OSSL_PARAM_construct_end();
+    params[9] = OSSL_PARAM_construct_int(OSSL_CIPHER_PARAM_ENCRYPT_THEN_MAC,
+                                         &encrypt_then_mac);
+    params[10] = OSSL_PARAM_construct_end();
     ok = evp_do_ciph_getparams(cipher, params) > 0;
     if (ok) {
         cipher->block_size = blksz;
@@ -357,6 +360,8 @@ int evp_cipher_cache_constants(EVP_CIPHER *cipher)
             cipher->flags |= EVP_CIPH_FLAG_CUSTOM_CIPHER;
         if (randkey)
             cipher->flags |= EVP_CIPH_RAND_KEY;
+        if (encrypt_then_mac)
+            cipher->flags |= EVP_CIPH_FLAG_ENC_THEN_MAC;
         if (OSSL_PARAM_locate_const(EVP_CIPHER_gettable_ctx_params(cipher),
                                     OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS))
             cipher->flags |= EVP_CIPH_FLAG_CUSTOM_ASN1;
index 8790de50dd6a52843f4076aa85852cb10d2c5ce0..4bd45d3558b1cb0f76a8092ccd275452c7cb6e65 100644 (file)
@@ -1350,7 +1350,7 @@ static const unsigned char so[9517] = {
     0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x03,0x2E,  /* [ 9507] OBJ_SLH_DSA_SHAKE_256f_WITH_SHAKE256 */
 };
 
-#define NUM_NID 1487
+#define NUM_NID 1493
 static const ASN1_OBJECT nid_objs[NUM_NID] = {
     {"UNDEF", "undefined", NID_undef},
     {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]},
@@ -2839,14 +2839,22 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = {
     {"id-hash-slh-dsa-shake-192f-with-shake256", "SLH-DSA-SHAKE-192f-WITH-SHAKE256", NID_SLH_DSA_SHAKE_192f_WITH_SHAKE256, 9, &so[9489]},
     {"id-hash-slh-dsa-shake-256s-with-shake256", "SLH-DSA-SHAKE-256s-WITH-SHAKE256", NID_SLH_DSA_SHAKE_256s_WITH_SHAKE256, 9, &so[9498]},
     {"id-hash-slh-dsa-shake-256f-with-shake256", "SLH-DSA-SHAKE-256f-WITH-SHAKE256", NID_SLH_DSA_SHAKE_256f_WITH_SHAKE256, 9, &so[9507]},
+    {"AES-128-CBC-HMAC-SHA1-ETM", "aes-128-cbc-hmac-sha1-etm", NID_aes_128_cbc_hmac_sha1_etm},
+    {"AES-192-CBC-HMAC-SHA1-ETM", "aes-192-cbc-hmac-sha1-etm", NID_aes_192_cbc_hmac_sha1_etm},
+    {"AES-256-CBC-HMAC-SHA1-ETM", "aes-256-cbc-hmac-sha1-etm", NID_aes_256_cbc_hmac_sha1_etm},
+    {"AES-128-CBC-HMAC-SHA256-ETM", "aes-128-cbc-hmac-sha256-etm", NID_aes_128_cbc_hmac_sha256_etm},
+    {"AES-192-CBC-HMAC-SHA256-ETM", "aes-192-cbc-hmac-sha256-etm", NID_aes_192_cbc_hmac_sha256_etm},
+    {"AES-256-CBC-HMAC-SHA256-ETM", "aes-256-cbc-hmac-sha256-etm", NID_aes_256_cbc_hmac_sha256_etm},
 };
 
-#define NUM_SN 1478
+#define NUM_SN 1484
 static const unsigned int sn_objs[NUM_SN] = {
      364,    /* "AD_DVCS" */
      419,    /* "AES-128-CBC" */
      916,    /* "AES-128-CBC-HMAC-SHA1" */
+    1487,    /* "AES-128-CBC-HMAC-SHA1-ETM" */
      948,    /* "AES-128-CBC-HMAC-SHA256" */
+    1490,    /* "AES-128-CBC-HMAC-SHA256-ETM" */
      421,    /* "AES-128-CFB" */
      650,    /* "AES-128-CFB1" */
      653,    /* "AES-128-CFB8" */
@@ -2858,7 +2866,9 @@ static const unsigned int sn_objs[NUM_SN] = {
      913,    /* "AES-128-XTS" */
      423,    /* "AES-192-CBC" */
      917,    /* "AES-192-CBC-HMAC-SHA1" */
+    1488,    /* "AES-192-CBC-HMAC-SHA1-ETM" */
      949,    /* "AES-192-CBC-HMAC-SHA256" */
+    1491,    /* "AES-192-CBC-HMAC-SHA256-ETM" */
      425,    /* "AES-192-CFB" */
      651,    /* "AES-192-CFB1" */
      654,    /* "AES-192-CFB8" */
@@ -2869,7 +2879,9 @@ static const unsigned int sn_objs[NUM_SN] = {
     1199,    /* "AES-192-SIV" */
      427,    /* "AES-256-CBC" */
      918,    /* "AES-256-CBC-HMAC-SHA1" */
+    1489,    /* "AES-256-CBC-HMAC-SHA1-ETM" */
      950,    /* "AES-256-CBC-HMAC-SHA256" */
+    1492,    /* "AES-256-CBC-HMAC-SHA256-ETM" */
      429,    /* "AES-256-CFB" */
      652,    /* "AES-256-CFB1" */
      655,    /* "AES-256-CFB8" */
@@ -4323,7 +4335,7 @@ static const unsigned int sn_objs[NUM_SN] = {
     1289,    /* "zstd" */
 };
 
-#define NUM_LN 1478
+#define NUM_LN 1484
 static const unsigned int ln_objs[NUM_LN] = {
      363,    /* "AD Time Stamping" */
      405,    /* "ANSI X9.62" */
@@ -4758,7 +4770,9 @@ static const unsigned int ln_objs[NUM_LN] = {
      606,    /* "additional verification" */
      419,    /* "aes-128-cbc" */
      916,    /* "aes-128-cbc-hmac-sha1" */
+    1487,    /* "aes-128-cbc-hmac-sha1-etm" */
      948,    /* "aes-128-cbc-hmac-sha256" */
+    1490,    /* "aes-128-cbc-hmac-sha256-etm" */
      896,    /* "aes-128-ccm" */
      421,    /* "aes-128-cfb" */
      650,    /* "aes-128-cfb1" */
@@ -4772,7 +4786,9 @@ static const unsigned int ln_objs[NUM_LN] = {
      913,    /* "aes-128-xts" */
      423,    /* "aes-192-cbc" */
      917,    /* "aes-192-cbc-hmac-sha1" */
+    1488,    /* "aes-192-cbc-hmac-sha1-etm" */
      949,    /* "aes-192-cbc-hmac-sha256" */
+    1491,    /* "aes-192-cbc-hmac-sha256-etm" */
      899,    /* "aes-192-ccm" */
      425,    /* "aes-192-cfb" */
      651,    /* "aes-192-cfb1" */
@@ -4785,7 +4801,9 @@ static const unsigned int ln_objs[NUM_LN] = {
     1199,    /* "aes-192-siv" */
      427,    /* "aes-256-cbc" */
      918,    /* "aes-256-cbc-hmac-sha1" */
+    1489,    /* "aes-256-cbc-hmac-sha1-etm" */
      950,    /* "aes-256-cbc-hmac-sha256" */
+    1492,    /* "aes-256-cbc-hmac-sha256-etm" */
      902,    /* "aes-256-ccm" */
      429,    /* "aes-256-cfb" */
      652,    /* "aes-256-cfb1" */
index 80413e087a0969ab4944c4d2dc6837ab8633fe29..b43639311120a3c2c3b850799a65724ddede90c0 100644 (file)
@@ -1484,3 +1484,9 @@ SLH_DSA_SHAKE_192s_WITH_SHAKE256          1483
 SLH_DSA_SHAKE_192f_WITH_SHAKE256               1484
 SLH_DSA_SHAKE_256s_WITH_SHAKE256               1485
 SLH_DSA_SHAKE_256f_WITH_SHAKE256               1486
+aes_128_cbc_hmac_sha1_etm              1487
+aes_192_cbc_hmac_sha1_etm              1488
+aes_256_cbc_hmac_sha1_etm              1489
+aes_128_cbc_hmac_sha256_etm            1490
+aes_192_cbc_hmac_sha256_etm            1491
+aes_256_cbc_hmac_sha256_etm            1492
index 06fe6f4bdadf36b1341a951560112e553255c945..9c61c4a642a5fbf8c48204fa3144ed681cc5e7e1 100644 (file)
@@ -1721,6 +1721,12 @@ sm-scheme 104 10        : SM4-XTS             : sm4-xts
                        : AES-256-CBC-HMAC-SHA256       : aes-256-cbc-hmac-sha256
                        : ChaCha20-Poly1305             : chacha20-poly1305
                        : ChaCha20                      : chacha20
+                       : AES-128-CBC-HMAC-SHA1-ETM     : aes-128-cbc-hmac-sha1-etm
+                       : AES-192-CBC-HMAC-SHA1-ETM     : aes-192-cbc-hmac-sha1-etm
+                       : AES-256-CBC-HMAC-SHA1-ETM     : aes-256-cbc-hmac-sha1-etm
+                       : AES-128-CBC-HMAC-SHA256-ETM   : aes-128-cbc-hmac-sha256-etm
+                       : AES-192-CBC-HMAC-SHA256-ETM   : aes-192-cbc-hmac-sha256-etm
+                       : AES-256-CBC-HMAC-SHA256-ETM   : aes-256-cbc-hmac-sha256-etm
 
 ISO-US 10046 2 1       : dhpublicnumber                : X9.42 DH
 
index 140b78e7cd97507570884647d86946f08e7ccb33..34aa74ecb2562f0d0719b0b9786ca255e0cf6b39 100644 (file)
@@ -92,8 +92,8 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
 #   endif /* OPENSSL_SYS_AIX || OPENSSL_SYS_MACOSX */
 #  endif /* PPC */
 
-#  if (defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(_M_ARM64)) 
-#   include "arm_arch.h"
+#  if (defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(_M_ARM64))
+#   include "crypto/arm_arch.h"
 #   if __ARM_MAX_ARCH__>=7
 #    if defined(BSAES_ASM)
 #     define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
@@ -112,6 +112,13 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
 #     define ARMv8_HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
 #     define HWAES_xts_encrypt aes_v8_xts_encrypt
 #     define HWAES_xts_decrypt aes_v8_xts_decrypt
+#     define HWAES_CBC_HMAC_SHA1_ETM_CAPABLE (HWAES_CAPABLE && \
+                                              (OPENSSL_armcap_P & ARMV8_SHA1))
+#     define HWAES_CBC_HMAC_SHA256_ETM_CAPABLE (HWAES_CAPABLE && \
+                                                (OPENSSL_armcap_P & ARMV8_SHA256))
+#     ifndef __AARCH64EB__
+#      define AES_CBC_HMAC_SHA_ETM_CAPABLE 1
+#     endif
 #    endif
 #    define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
 #    define HWAES_ctr32_encrypt_blocks_unroll12_eor3 aes_v8_ctr32_encrypt_blocks_unroll12_eor3
index 8e38ab29ea365b3e5545694f37f79ecaf6cb6eed..4fe841da35ed50b1e0ef6ccbeccdad4fa081df70 100644 (file)
@@ -376,6 +376,7 @@ OSSL_DEPRECATEDIN_3_0 int
 /* For supplementary wrap cipher support */
 # define         EVP_CIPH_FLAG_GET_WRAP_CIPHER   0x4000000
 # define         EVP_CIPH_FLAG_INVERSE_CIPHER    0x8000000
+# define         EVP_CIPH_FLAG_ENC_THEN_MAC      0x10000000
 
 /*
  * Cipher context flag to indicate we can handle wrap mode: if allowed in
index ee71e26458ff72c64eb9b903186cff5d05a78243..0f3d79a889f197a0b64e9e141f3f4288017dff3b 100644 (file)
 #define LN_chacha20             "chacha20"
 #define NID_chacha20            1019
 
+#define SN_aes_128_cbc_hmac_sha1_etm            "AES-128-CBC-HMAC-SHA1-ETM"
+#define LN_aes_128_cbc_hmac_sha1_etm            "aes-128-cbc-hmac-sha1-etm"
+#define NID_aes_128_cbc_hmac_sha1_etm           1487
+
+#define SN_aes_192_cbc_hmac_sha1_etm            "AES-192-CBC-HMAC-SHA1-ETM"
+#define LN_aes_192_cbc_hmac_sha1_etm            "aes-192-cbc-hmac-sha1-etm"
+#define NID_aes_192_cbc_hmac_sha1_etm           1488
+
+#define SN_aes_256_cbc_hmac_sha1_etm            "AES-256-CBC-HMAC-SHA1-ETM"
+#define LN_aes_256_cbc_hmac_sha1_etm            "aes-256-cbc-hmac-sha1-etm"
+#define NID_aes_256_cbc_hmac_sha1_etm           1489
+
+#define SN_aes_128_cbc_hmac_sha256_etm          "AES-128-CBC-HMAC-SHA256-ETM"
+#define LN_aes_128_cbc_hmac_sha256_etm          "aes-128-cbc-hmac-sha256-etm"
+#define NID_aes_128_cbc_hmac_sha256_etm         1490
+
+#define SN_aes_192_cbc_hmac_sha256_etm          "AES-192-CBC-HMAC-SHA256-ETM"
+#define LN_aes_192_cbc_hmac_sha256_etm          "aes-192-cbc-hmac-sha256-etm"
+#define NID_aes_192_cbc_hmac_sha256_etm         1491
+
+#define SN_aes_256_cbc_hmac_sha256_etm          "AES-256-CBC-HMAC-SHA256-ETM"
+#define LN_aes_256_cbc_hmac_sha256_etm          "aes-256-cbc-hmac-sha256-etm"
+#define NID_aes_256_cbc_hmac_sha256_etm         1492
+
 #define SN_dhpublicnumber               "dhpublicnumber"
 #define LN_dhpublicnumber               "X9.42 DH"
 #define NID_dhpublicnumber              920
index 4a1a043a84e9d1e4d529312162ffad0920ca29ef..7621b4b1d62703d87ed42ead556ea3b70e61e8e4 100644 (file)
@@ -14,6 +14,8 @@ const OSSL_CORE_HANDLE *FIPS_get_core_handle(OSSL_LIB_CTX *ctx);
 
 int ossl_cipher_capable_aes_cbc_hmac_sha1(void);
 int ossl_cipher_capable_aes_cbc_hmac_sha256(void);
+int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void);
+int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void);
 
 OSSL_FUNC_provider_get_capabilities_fn ossl_prov_get_capabilities;
 
index eee2178b4168b75fb375359983436703502360a5..6e33f6ee66b4e39fb08b68d325fc3aaab5149cd4 100644 (file)
@@ -229,6 +229,18 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
         ossl_cipher_capable_aes_cbc_hmac_sha256),
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256, ossl_aes256cbc_hmac_sha256_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha256),
+    ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM, ossl_aes128cbc_hmac_sha1_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
+    ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA1_ETM, ossl_aes192cbc_hmac_sha1_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
+    ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM, ossl_aes256cbc_hmac_sha1_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
+    ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM, ossl_aes128cbc_hmac_sha256_etm_functions,
+        ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+    ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM, ossl_aes192cbc_hmac_sha256_etm_functions,
+        ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+    ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM, ossl_aes256cbc_hmac_sha256_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
 #ifndef OPENSSL_NO_ARIA
     ALG(PROV_NAMES_ARIA_256_GCM, ossl_aria256gcm_functions),
     ALG(PROV_NAMES_ARIA_192_GCM, ossl_aria192gcm_functions),
index 373cd1c2e4c5beee7ad11fe2bb9cc8bff21f11e2..03258fc97f5ca9a8e232c4aea2851102b8067aef 100644 (file)
@@ -354,6 +354,18 @@ static const OSSL_ALGORITHM_CAPABLE fips_ciphers[] = {
          ossl_cipher_capable_aes_cbc_hmac_sha256),
     ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256, ossl_aes256cbc_hmac_sha256_functions,
          ossl_cipher_capable_aes_cbc_hmac_sha256),
+    ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM, ossl_aes128cbc_hmac_sha1_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
+    ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA1_ETM, ossl_aes192cbc_hmac_sha1_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
+    ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM, ossl_aes256cbc_hmac_sha1_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha1_etm),
+    ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM, ossl_aes128cbc_hmac_sha256_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+    ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM, ossl_aes192cbc_hmac_sha256_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
+    ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM, ossl_aes256cbc_hmac_sha256_etm_functions,
+         ossl_cipher_capable_aes_cbc_hmac_sha256_etm),
 #ifndef OPENSSL_NO_DES
     ALG(PROV_NAMES_DES_EDE3_ECB, ossl_tdes_ede3_ecb_functions),
     ALG(PROV_NAMES_DES_EDE3_CBC, ossl_tdes_ede3_cbc_functions),
index 1837070c211178e6c92f2c5c8f97c2a86011d9f9..47c140ace110c3b2a916f90915c9922fa2b8faba 100644 (file)
@@ -105,6 +105,9 @@ SOURCE[$AES_GOAL]=\
         cipher_aes_wrp.c \
         cipher_aes_cbc_hmac_sha.c \
         cipher_aes_cbc_hmac_sha256_hw.c cipher_aes_cbc_hmac_sha1_hw.c \
+        cipher_aes_cbc_hmac_sha_etm.c \
+        cipher_aes_cbc_hmac_sha1_etm_hw.c \
+        cipher_aes_cbc_hmac_sha256_etm_hw.c \
         cipher_cts.c
 DEFINE[$AES_GOAL]=$AESXTSDEF
 
diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c
new file mode 100644 (file)
index 0000000..5d164ff
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+#include "internal/deprecated.h"
+#include "cipher_aes_cbc_hmac_sha_etm.h"
+
+#if !defined(AES_CBC_HMAC_SHA_ETM_CAPABLE)
+int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void)
+{
+    return 0;
+}
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void)
+{
+    return NULL;
+}
+#else
+void sha1_block_data_order(void *c, const void *p, size_t len);
+
+# if defined(__aarch64__)
+int asm_aescbc_sha1_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+                         uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+                         CIPH_DIGEST *arg);
+void asm_sha1_hmac_aescbc_dec(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+                              const unsigned char *dsrc, uint8_t *ddst, size_t dlen,
+                              CIPH_DIGEST *arg);
+#  define HWAES128_ENC_CBC_SHA1_ETM asm_aescbc_sha1_hmac
+#  define HWAES128_DEC_CBC_SHA1_ETM asm_sha1_hmac_aescbc_dec
+# endif
+
+int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void)
+{
+    return HWAES_CBC_HMAC_SHA1_ETM_CAPABLE;
+}
+
+static int aes_cbc_hmac_sha1_init_key(PROV_CIPHER_CTX *vctx,
+                                      const unsigned char *key, size_t keylen)
+{
+    int ret;
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_AES_HMAC_SHA1_ETM_CTX *sctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx;
+
+    if (ctx->base.enc)
+        ret = aes_v8_set_encrypt_key(key, keylen * 8, &ctx->ks);
+    else
+        ret = aes_v8_set_decrypt_key(key, keylen * 8, &ctx->ks);
+
+    SHA1_Init(&sctx->head);      /* handy when benchmarking */
+    sctx->tail = sctx->head;
+
+    return ret < 0 ? 0 : 1;
+}
+
+static void ciph_digest_arg_init(CIPH_DIGEST *arg, PROV_CIPHER_CTX *vctx)
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_AES_HMAC_SHA1_ETM_CTX *sctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx;
+
+    arg->cipher.key = (uint8_t *)&(ctx->ks);
+    arg->cipher.key_rounds = ctx->ks.rounds;
+    arg->cipher.iv = (uint8_t *)&(ctx->base.iv);
+    arg->digest.hmac.i_key_pad = (uint8_t *)&(sctx->head);
+    arg->digest.hmac.o_key_pad = (uint8_t *)&(sctx->tail);
+}
+
+static int hwaes_cbc_hmac_sha1_etm(PROV_CIPHER_CTX *vctx,
+                                   unsigned char *out,
+                                   const unsigned char *in, size_t len)
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    CIPH_DIGEST arg = {0};
+    ciph_digest_arg_init(&arg, vctx);
+    if (len % AES_BLOCK_SIZE) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH);
+        return 0;
+    }
+    if (ctx->base.enc) {
+        HWAES128_ENC_CBC_SHA1_ETM(in, out, len, out, ctx->tag, len, &arg);
+        return 1;
+    } else {
+        if (ctx->taglen == 0) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_TAG_NOT_SET);
+            return 0;
+        }
+        HWAES128_DEC_CBC_SHA1_ETM(in, out, len, in, ctx->tag, len, &arg);
+        if (CRYPTO_memcmp(ctx->exp_tag, ctx->tag, ctx->taglen)) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG);
+            return 0;
+        }
+        return 1;
+    }
+}
+
+static void sha1_update(SHA_CTX *c, const void *data, size_t len)
+{
+    const unsigned char *ptr = data;
+    size_t res;
+
+    if ((res = c->num)) {
+        res = SHA_CBLOCK - res;
+        if (len < res)
+            res = len;
+        SHA1_Update(c, ptr, res);
+        ptr += res;
+        len -= res;
+    }
+
+    res = len % SHA_CBLOCK;
+    len -= res;
+
+    if (len) {
+        sha1_block_data_order(c, ptr, len / SHA_CBLOCK);
+
+        ptr += len;
+        c->Nh += len >> 29;
+        c->Nl += len <<= 3;
+        if (c->Nl < (unsigned int)len)
+            c->Nh++;
+    }
+
+    if (res)
+        SHA1_Update(c, ptr, res);
+}
+
+static void aes_cbc_hmac_sha1_set_mac_key(void *vctx,
+                                          const unsigned char *mac, size_t len)
+{
+    PROV_AES_HMAC_SHA1_ETM_CTX *ctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx;
+    unsigned int i;
+    unsigned char hmac_key[64];
+
+    memset(hmac_key, 0, sizeof(hmac_key));
+
+    if (len > (int)sizeof(hmac_key)) {
+        SHA1_Init(&ctx->head);
+        sha1_update(&ctx->head, mac, len);
+        SHA1_Final(hmac_key, &ctx->head);
+    } else {
+        memcpy(hmac_key, mac, len);
+    }
+
+    for (i = 0; i < sizeof(hmac_key); i++)
+        hmac_key[i] ^= 0x36; /* ipad */
+    SHA1_Init(&ctx->head);
+    sha1_update(&ctx->head, hmac_key, sizeof(hmac_key));
+
+    for (i = 0; i < sizeof(hmac_key); i++)
+        hmac_key[i] ^= 0x36 ^ 0x5c; /* opad */
+    SHA1_Init(&ctx->tail);
+    sha1_update(&ctx->tail, hmac_key, sizeof(hmac_key));
+
+    OPENSSL_cleanse(hmac_key, sizeof(hmac_key));
+}
+
+static int aes_cbc_hmac_sha1_cipher(PROV_CIPHER_CTX *vctx,
+                                    unsigned char *out,
+                                    const unsigned char *in, size_t len)
+{
+    return hwaes_cbc_hmac_sha1_etm(vctx, out, in, len);
+}
+
+static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha1_etm = {
+    {
+      aes_cbc_hmac_sha1_init_key,
+      aes_cbc_hmac_sha1_cipher
+    },
+    aes_cbc_hmac_sha1_set_mac_key
+};
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void)
+{
+    return &cipher_hw_aes_hmac_sha1_etm;
+}
+#endif
diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c
new file mode 100644 (file)
index 0000000..8a5474f
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+#include "internal/deprecated.h"
+#include "cipher_aes_cbc_hmac_sha_etm.h"
+
+#if !defined(AES_CBC_HMAC_SHA_ETM_CAPABLE)
+int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void)
+{
+    return 0;
+}
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void)
+{
+    return NULL;
+}
+#else
+void sha256_block_data_order(void *c, const void *p, size_t len);
+
+# if defined(__aarch64__)
+int asm_aescbc_sha256_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+                         uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+                         CIPH_DIGEST *arg);
+void asm_sha256_hmac_aescbc_dec(const uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+                              const unsigned char *dsrc, uint8_t *ddst, size_t dlen,
+                              CIPH_DIGEST *arg);
+#  define HWAES128_ENC_CBC_SHA256_ETM asm_aescbc_sha256_hmac
+#  define HWAES128_DEC_CBC_SHA256_ETM asm_sha256_hmac_aescbc_dec
+# endif
+
+int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void)
+{
+    return HWAES_CBC_HMAC_SHA256_ETM_CAPABLE;
+}
+
+static int aes_cbc_hmac_sha256_init_key(PROV_CIPHER_CTX *vctx,
+                                        const unsigned char *key, size_t keylen)
+{
+    int ret;
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_AES_HMAC_SHA256_ETM_CTX *sctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx;
+
+    if (ctx->base.enc)
+        ret = aes_v8_set_encrypt_key(key, keylen * 8, &ctx->ks);
+    else
+        ret = aes_v8_set_decrypt_key(key, keylen * 8, &ctx->ks);
+
+    SHA256_Init(&sctx->head);      /* handy when benchmarking */
+    sctx->tail = sctx->head;
+
+    return ret < 0 ? 0 : 1;
+}
+
+static void ciph_digest_arg_init(CIPH_DIGEST *arg, PROV_CIPHER_CTX *vctx)
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_AES_HMAC_SHA256_ETM_CTX *sctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx;
+
+    arg->cipher.key = (uint8_t *)&(ctx->ks);
+    arg->cipher.key_rounds = ctx->ks.rounds;
+    arg->cipher.iv = (uint8_t *)&(ctx->base.iv);
+    arg->digest.hmac.i_key_pad = (uint8_t *)&(sctx->head);
+    arg->digest.hmac.o_key_pad = (uint8_t *)&(sctx->tail);
+}
+
+static int hwaes_cbc_hmac_sha256_etm(PROV_CIPHER_CTX *vctx,
+                                     unsigned char *out,
+                                     const unsigned char *in, size_t len)
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    CIPH_DIGEST arg = {0};
+    ciph_digest_arg_init(&arg, vctx);
+    if (len % AES_BLOCK_SIZE) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH);
+        return 0;
+    }
+    if (ctx->base.enc) {
+        HWAES128_ENC_CBC_SHA256_ETM(in, out, len, out, ctx->tag, len, &arg);
+        return 1;
+    } else {
+        if (ctx->taglen == 0) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_TAG_NOT_SET);
+            return 0;
+        }
+        HWAES128_DEC_CBC_SHA256_ETM(in, out, len, in, ctx->tag, len, &arg);
+        if (CRYPTO_memcmp(ctx->exp_tag, ctx->tag, ctx->taglen)) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG);
+            return 0;
+        }
+        return 1;
+    }
+}
+
+static void sha256_update(SHA256_CTX *c, const void *data, size_t len)
+{
+    const unsigned char *ptr = data;
+    size_t res;
+
+    if ((res = c->num)) {
+        res = SHA256_CBLOCK - res;
+        if (len < res)
+            res = len;
+        SHA256_Update(c, ptr, res);
+        ptr += res;
+        len -= res;
+    }
+
+    res = len % SHA256_CBLOCK;
+    len -= res;
+
+    if (len) {
+        sha256_block_data_order(c, ptr, len / SHA256_CBLOCK);
+
+        ptr += len;
+        c->Nh += len >> 29;
+        c->Nl += len <<= 3;
+        if (c->Nl < (unsigned int)len)
+            c->Nh++;
+    }
+
+    if (res)
+        SHA256_Update(c, ptr, res);
+}
+
+static void aes_cbc_hmac_sha256_set_mac_key(void *vctx,
+                                            const unsigned char *mac, size_t len)
+{
+    PROV_AES_HMAC_SHA256_ETM_CTX *ctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx;
+    unsigned int i;
+    unsigned char hmac_key[64];
+
+    memset(hmac_key, 0, sizeof(hmac_key));
+
+    if (len > (int)sizeof(hmac_key)) {
+        SHA256_Init(&ctx->head);
+        sha256_update(&ctx->head, mac, len);
+        SHA256_Final(hmac_key, &ctx->head);
+    } else {
+        memcpy(hmac_key, mac, len);
+    }
+
+    for (i = 0; i < sizeof(hmac_key); i++)
+        hmac_key[i] ^= 0x36; /* ipad */
+    SHA256_Init(&ctx->head);
+    sha256_update(&ctx->head, hmac_key, sizeof(hmac_key));
+
+    for (i = 0; i < sizeof(hmac_key); i++)
+        hmac_key[i] ^= 0x36 ^ 0x5c; /* opad */
+    SHA256_Init(&ctx->tail);
+    sha256_update(&ctx->tail, hmac_key, sizeof(hmac_key));
+
+    OPENSSL_cleanse(hmac_key, sizeof(hmac_key));
+}
+
+static int aes_cbc_hmac_sha256_cipher(PROV_CIPHER_CTX *vctx,
+                                      unsigned char *out,
+                                      const unsigned char *in, size_t len)
+{
+    return hwaes_cbc_hmac_sha256_etm(vctx, out, in, len);
+}
+
+static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha256_etm = {
+    {
+      aes_cbc_hmac_sha256_init_key,
+      aes_cbc_hmac_sha256_cipher
+    },
+    aes_cbc_hmac_sha256_set_mac_key
+};
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void)
+{
+    return &cipher_hw_aes_hmac_sha256_etm;
+}
+#endif
diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c
new file mode 100644 (file)
index 0000000..0292511
--- /dev/null
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+#include "internal/deprecated.h"
+
+#include "cipher_aes_cbc_hmac_sha_etm.h"
+#include "prov/providercommon.h"
+#include "prov/ciphercommon_aead.h"
+#include "prov/implementations.h"
+
+#ifndef AES_CBC_HMAC_SHA_ETM_CAPABLE
+# define IMPLEMENT_CIPHER(nm, sub, kbits, blkbits, ivbits, flags)              \
+const OSSL_DISPATCH ossl_##nm##kbits##sub##_functions[] = {                    \
+    OSSL_DISPATCH_END                                                              \
+};
+#else
+static OSSL_FUNC_cipher_encrypt_init_fn aes_einit;
+static OSSL_FUNC_cipher_decrypt_init_fn aes_dinit;
+static OSSL_FUNC_cipher_gettable_ctx_params_fn aes_gettable_ctx_params;
+static OSSL_FUNC_cipher_settable_ctx_params_fn aes_settable_ctx_params;
+# define aes_gettable_params ossl_cipher_generic_gettable_params
+# define aes_update ossl_cipher_generic_stream_update
+# define aes_final ossl_cipher_generic_stream_final
+# define aes_cipher ossl_cipher_generic_cipher
+
+static int aes_set_ctx_params(void *vctx, const OSSL_PARAM params[])
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    PROV_CIPHER_HW_AES_HMAC_SHA_ETM *hw =
+       (PROV_CIPHER_HW_AES_HMAC_SHA_ETM *)ctx->hw;
+    const OSSL_PARAM *p;
+
+    if (params == NULL)
+        return 1;
+
+    p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_AEAD_MAC_KEY);
+    if (p != NULL) {
+        if (p->data_type != OSSL_PARAM_OCTET_STRING) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER);
+            return 0;
+        }
+        hw->init_mac_key(ctx, p->data, p->data_size);
+    }
+
+    p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_KEYLEN);
+    if (p != NULL) {
+        size_t keylen;
+
+        if (!OSSL_PARAM_get_size_t(p, &keylen)) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER);
+            return 0;
+        }
+        if (ctx->base.keylen != keylen) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_KEY_LENGTH);
+            return 0;
+        }
+    }
+
+    p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_HMAC_PARAM_MAC);
+    if (p != NULL) {
+        size_t sz;
+        void *vp;
+
+        vp = &ctx->exp_tag;
+        if (!OSSL_PARAM_get_octet_string(p, &vp, AES_CBC_MAX_HMAC_SIZE, &sz)) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER);
+            return 0;
+        }
+        if (sz == 0) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG);
+            return 0;
+        }
+        ctx->taglen = sz;
+    }
+
+    return 1;
+}
+
+static int aes_einit(void *ctx, const unsigned char *key, size_t keylen,
+                          const unsigned char *iv, size_t ivlen,
+                          const OSSL_PARAM params[])
+{
+    if (!ossl_cipher_generic_einit(ctx, key, keylen, iv, ivlen, NULL))
+        return 0;
+    return aes_set_ctx_params(ctx, params);
+}
+
+static int aes_dinit(void *ctx, const unsigned char *key, size_t keylen,
+                          const unsigned char *iv, size_t ivlen,
+                          const OSSL_PARAM params[])
+{
+    if (!ossl_cipher_generic_dinit(ctx, key, keylen, iv, ivlen, NULL))
+        return 0;
+    return aes_set_ctx_params(ctx, params);
+}
+
+static int aes_get_ctx_params(void *vctx, OSSL_PARAM params[])
+{
+    PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx;
+    OSSL_PARAM *p;
+    size_t sz;
+
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_KEYLEN);
+    if (p != NULL && !OSSL_PARAM_set_size_t(p, ctx->base.keylen)) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+        return 0;
+    }
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_IVLEN);
+    if (p != NULL && !OSSL_PARAM_set_size_t(p, ctx->base.ivlen)) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+        return 0;
+    }
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_IV);
+    if (p != NULL
+        && !OSSL_PARAM_set_octet_string(p, ctx->base.oiv, ctx->base.ivlen)
+        && !OSSL_PARAM_set_octet_ptr(p, &ctx->base.oiv, ctx->base.ivlen)) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+        return 0;
+    }
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_UPDATED_IV);
+    if (p != NULL
+        && !OSSL_PARAM_set_octet_string(p, ctx->base.iv, ctx->base.ivlen)
+        && !OSSL_PARAM_set_octet_ptr(p, &ctx->base.iv, ctx->base.ivlen)) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+        return 0;
+    }
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_HMAC_PARAM_MAC);
+    if (p != NULL) {
+        sz = p->data_size;
+        if (sz == 0
+            || sz > AES_CBC_MAX_HMAC_SIZE
+            || !ctx->base.enc
+            || ctx->taglen == UNINITIALISED_SIZET) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG);
+            return 0;
+        }
+        if (!OSSL_PARAM_set_octet_string(p, ctx->tag, sz)) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static const OSSL_PARAM cipher_aes_known_gettable_ctx_params[] = {
+    OSSL_PARAM_size_t(OSSL_CIPHER_PARAM_KEYLEN, NULL),
+    OSSL_PARAM_size_t(OSSL_CIPHER_PARAM_IVLEN, NULL),
+    OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_IV, NULL, 0),
+    OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_UPDATED_IV, NULL, 0),
+    OSSL_PARAM_END
+};
+
+const OSSL_PARAM *aes_gettable_ctx_params(ossl_unused void *cctx,
+                                          ossl_unused void *provctx)
+{
+    return cipher_aes_known_gettable_ctx_params;
+}
+
+static const OSSL_PARAM cipher_aes_known_settable_ctx_params[] = {
+    OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_AEAD_MAC_KEY, NULL, 0),
+    OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_AEAD_TLS1_AAD, NULL, 0),
+    OSSL_PARAM_size_t(OSSL_CIPHER_PARAM_KEYLEN, NULL),
+    OSSL_PARAM_END
+};
+
+const OSSL_PARAM *aes_settable_ctx_params(ossl_unused void *cctx,
+                                          ossl_unused void *provctx)
+{
+    return cipher_aes_known_settable_ctx_params;
+}
+
+static void base_ctx_init(void *provctx, PROV_AES_HMAC_SHA_ETM_CTX *ctx,
+                          const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *meths,
+                          size_t kbits, size_t blkbits, size_t ivbits,
+                          uint64_t flags)
+{
+    ossl_cipher_generic_initkey(&ctx->base, kbits, blkbits, ivbits,
+                                EVP_CIPH_CBC_MODE, flags,
+                                &meths->base, provctx);
+    ctx->hw = (PROV_CIPHER_HW_AES_HMAC_SHA_ETM *)ctx->base.hw;
+}
+
+static void *aes_cbc_hmac_sha1_etm_newctx(void *provctx, size_t kbits,
+                                          size_t blkbits, size_t ivbits,
+                                          uint64_t flags)
+{
+    PROV_AES_HMAC_SHA1_ETM_CTX *ctx;
+
+    if (!ossl_prov_is_running())
+        return NULL;
+
+    ctx = OPENSSL_zalloc(sizeof(*ctx));
+    if (ctx != NULL)
+        base_ctx_init(provctx, &ctx->base_ctx,
+                      ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(), kbits, blkbits,
+                      ivbits, flags);
+    return ctx;
+}
+
+static void aes_cbc_hmac_sha1_etm_freectx(void *vctx)
+{
+    PROV_AES_HMAC_SHA1_ETM_CTX *ctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx;
+
+    if (ctx != NULL) {
+        ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx);
+        OPENSSL_clear_free(ctx, sizeof(*ctx));
+    }
+}
+
+static void *aes_cbc_hmac_sha1_etm_dupctx(void *provctx)
+{
+    PROV_AES_HMAC_SHA1_ETM_CTX *ctx = provctx;
+
+    if (ctx == NULL)
+        return NULL;
+
+    return OPENSSL_memdup(ctx, sizeof(*ctx));
+}
+
+static void *aes_cbc_hmac_sha256_etm_newctx(void *provctx, size_t kbits,
+                                            size_t blkbits, size_t ivbits,
+                                            uint64_t flags)
+{
+    PROV_AES_HMAC_SHA256_ETM_CTX *ctx;
+
+    if (!ossl_prov_is_running())
+        return NULL;
+
+    ctx = OPENSSL_zalloc(sizeof(*ctx));
+    if (ctx != NULL)
+        base_ctx_init(provctx, &ctx->base_ctx,
+                      ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(), kbits, blkbits,
+                      ivbits, flags);
+    return ctx;
+}
+
+static void aes_cbc_hmac_sha256_etm_freectx(void *vctx)
+{
+    PROV_AES_HMAC_SHA256_ETM_CTX *ctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx;
+
+    if (ctx != NULL) {
+        ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx);
+        OPENSSL_clear_free(ctx, sizeof(*ctx));
+    }
+}
+
+static void *aes_cbc_hmac_sha256_etm_dupctx(void *provctx)
+{
+    PROV_AES_HMAC_SHA256_ETM_CTX *ctx = provctx;
+
+    if (ctx == NULL)
+        return NULL;
+
+    return OPENSSL_memdup(ctx, sizeof(*ctx));
+}
+
+# define IMPLEMENT_CIPHER(nm, sub, kbits, blkbits, ivbits, flags)              \
+static OSSL_FUNC_cipher_newctx_fn nm##_##kbits##_##sub##_newctx;               \
+static void *nm##_##kbits##_##sub##_newctx(void *provctx)                      \
+{                                                                              \
+    return nm##_##sub##_newctx(provctx, kbits, blkbits, ivbits, flags);        \
+}                                                                              \
+static OSSL_FUNC_cipher_get_params_fn nm##_##kbits##_##sub##_get_params;       \
+static int nm##_##kbits##_##sub##_get_params(OSSL_PARAM params[])              \
+{                                                                              \
+    return ossl_cipher_generic_get_params(params, EVP_CIPH_CBC_MODE,           \
+                                          flags, kbits, blkbits, ivbits);      \
+}                                                                              \
+const OSSL_DISPATCH ossl_##nm##kbits##sub##_functions[] = {                    \
+    { OSSL_FUNC_CIPHER_NEWCTX, (void (*)(void))nm##_##kbits##_##sub##_newctx },\
+    { OSSL_FUNC_CIPHER_FREECTX, (void (*)(void))nm##_##sub##_freectx },        \
+    { OSSL_FUNC_CIPHER_DUPCTX,  (void (*)(void))nm##_##sub##_dupctx},          \
+    { OSSL_FUNC_CIPHER_ENCRYPT_INIT, (void (*)(void))nm##_einit },             \
+    { OSSL_FUNC_CIPHER_DECRYPT_INIT, (void (*)(void))nm##_dinit },             \
+    { OSSL_FUNC_CIPHER_UPDATE, (void (*)(void))nm##_update },                  \
+    { OSSL_FUNC_CIPHER_FINAL, (void (*)(void))nm##_final },                    \
+    { OSSL_FUNC_CIPHER_CIPHER, (void (*)(void))nm##_cipher },                  \
+    { OSSL_FUNC_CIPHER_GET_PARAMS,                                             \
+        (void (*)(void))nm##_##kbits##_##sub##_get_params },                   \
+    { OSSL_FUNC_CIPHER_GETTABLE_PARAMS,                                        \
+        (void (*)(void))nm##_gettable_params },                                \
+    { OSSL_FUNC_CIPHER_GET_CTX_PARAMS,                                         \
+         (void (*)(void))nm##_get_ctx_params },                                \
+    { OSSL_FUNC_CIPHER_GETTABLE_CTX_PARAMS,                                    \
+        (void (*)(void))nm##_gettable_ctx_params },                            \
+    { OSSL_FUNC_CIPHER_SET_CTX_PARAMS,                                         \
+        (void (*)(void))nm##_set_ctx_params },                                 \
+    { OSSL_FUNC_CIPHER_SETTABLE_CTX_PARAMS,                                    \
+        (void (*)(void))nm##_settable_ctx_params },                            \
+    OSSL_DISPATCH_END                                                          \
+};
+#endif /* AES_CBC_HMAC_SHA_ETM_CAPABLE */
+
+/* ossl_aes128cbc_hmac_sha1_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha1_etm, 128, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes192cbc_hmac_sha1_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha1_etm, 192, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes256cbc_hmac_sha1_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha1_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes128cbc_hmac_sha256_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 128, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes192cbc_hmac_sha256_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 192, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
+/* ossl_aes256cbc_hmac_sha256_etm_functions */
+IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC)
\ No newline at end of file
diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h
new file mode 100644 (file)
index 0000000..c8b2b1e
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/proverr.h>
+#include "prov/ciphercommon.h"
+#include "crypto/aes_platform.h"
+
+int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void);
+int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void);
+
+typedef struct prov_cipher_hw_aes_hmac_sha_ctx_etm_st {
+    PROV_CIPHER_HW base; /* must be first */
+    void (*init_mac_key)(void *ctx, const unsigned char *inkey, size_t inlen);
+} PROV_CIPHER_HW_AES_HMAC_SHA_ETM;
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void);
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void);
+
+#ifdef AES_CBC_HMAC_SHA_ETM_CAPABLE
+# include <openssl/aes.h>
+# include <openssl/sha.h>
+
+# define AES_CBC_MAX_HMAC_SIZE 32
+
+typedef struct prov_aes_hmac_sha_etm_ctx_st {
+    PROV_CIPHER_CTX base;
+    AES_KEY ks;
+    const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *hw;
+    unsigned char tag[AES_CBC_MAX_HMAC_SIZE];
+    unsigned char exp_tag[AES_CBC_MAX_HMAC_SIZE];
+    size_t taglen;
+} PROV_AES_HMAC_SHA_ETM_CTX;
+
+typedef struct prov_aes_hmac_sha1_etm_ctx_st {
+    PROV_AES_HMAC_SHA_ETM_CTX base_ctx;
+    SHA_CTX head, tail;
+} PROV_AES_HMAC_SHA1_ETM_CTX;
+
+typedef struct prov_aes_hmac_sha256_etm_ctx_st {
+    PROV_AES_HMAC_SHA_ETM_CTX base_ctx;
+    SHA256_CTX head, tail;
+} PROV_AES_HMAC_SHA256_ETM_CTX;
+
+typedef struct {
+    struct {
+        uint8_t *key;
+        uint8_t key_rounds;
+        uint8_t *iv;
+    } cipher;
+    struct {
+        struct {
+            uint8_t *i_key_pad;
+            uint8_t *o_key_pad;
+        } hmac;
+    } digest;
+} CIPH_DIGEST;
+
+#endif /* AES_CBC_HMAC_SHA_ETM_CAPABLE */
index b1331b5b64f1ea6e2652b3e210bb5d66fb74f240..3b09506447f21e6a75a4edbfc361e61b564c8c73 100644 (file)
@@ -81,6 +81,12 @@ int ossl_cipher_generic_get_params(OSSL_PARAM params[], unsigned int md,
         ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
         return 0;
     }
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_ENCRYPT_THEN_MAC);
+    if (p != NULL
+        && !OSSL_PARAM_set_int(p, (flags & EVP_CIPH_FLAG_ENC_THEN_MAC) != 0)) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+        return 0;
+    }
     p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_KEYLEN);
     if (p != NULL && !OSSL_PARAM_set_size_t(p, kbits / 8)) {
         ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
index 35b0b0b974064825a95d1f07c8392f6945068787..2b770badc691d25e78d83b36188a4073ec445ee5 100644 (file)
@@ -98,6 +98,12 @@ extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha1_functions[];
 extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha1_functions[];
 extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha256_functions[];
 extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha256_functions[];
+extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha1_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes192cbc_hmac_sha1_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha1_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha256_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes192cbc_hmac_sha256_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha256_etm_functions[];
 
 #ifndef OPENSSL_NO_ARIA
 extern const OSSL_DISPATCH ossl_aria256gcm_functions[];
index 3b747ec92c0489b202bb08b6f12247d20477589b..19fdf635c07ad2496aa11ab69947df58f9a4abd1 100644 (file)
 #define PROV_NAMES_DES_CFB "DES-CFB:1.3.14.3.2.9"
 #define PROV_NAMES_DES_CFB1 "DES-CFB1"
 #define PROV_NAMES_DES_CFB8 "DES-CFB8"
+#define PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM "AES-128-CBC-HMAC-SHA1-ETM"
+#define PROV_NAMES_AES_192_CBC_HMAC_SHA1_ETM "AES-192-CBC-HMAC-SHA1-ETM"
+#define PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM "AES-256-CBC-HMAC-SHA1-ETM"
+#define PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM "AES-128-CBC-HMAC-SHA256-ETM"
+#define PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM "AES-192-CBC-HMAC-SHA256-ETM"
+#define PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM "AES-256-CBC-HMAC-SHA256-ETM"
 
 /*-
  * Digests
index 039fca9bb09f814b6dc7609d1d5478f9ce9cc4dd..ae03e589a03e4401a45f9698263d37cca88a9dcf 100644 (file)
@@ -487,7 +487,8 @@ static int test_cipher_reinit_partialupdate(int test_id)
 
     /* skip any ciphers that don't allow partial updates */
     if (((EVP_CIPHER_get_flags(cipher)
-          & (EVP_CIPH_FLAG_CTS | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK)) != 0)
+          & (EVP_CIPH_FLAG_CTS | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK |
+          EVP_CIPH_FLAG_ENC_THEN_MAC)) != 0)
         || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_CCM_MODE
         || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_XTS_MODE
         || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_WRAP_MODE) {
index e34ea1d96e6769a846aca49fc3e3892c8000d03f..f45b7d816621d755d1ac14c6e633ce3da902aedc 100644 (file)
@@ -918,7 +918,8 @@ static int cipher_test_valid_fragmentation(CIPHER_DATA *cdat)
             || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_SIV_MODE
             || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_GCM_SIV_MODE
             || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_XTS_MODE
-            || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_WRAP_MODE) ? 0 : 1;
+            || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_WRAP_MODE
+            || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_CBC_MODE) ? 0 : 1;
 }
 
 static int cipher_test_init(EVP_TEST *t, const char *alg)
@@ -1025,6 +1026,10 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword,
         cdat->key_bits = (size_t)i;
         return 1;
     }
+    if (strcmp(keyword, "Tag") == 0)
+        return parse_bin(value, &cdat->tag, &cdat->tag_len);
+    if (strcmp(keyword, "MACKey") == 0)
+        return parse_bin(value, &cdat->mac_key, &cdat->mac_key_len);
     if (cdat->aead) {
         int tls_aad = 0;
 
@@ -1037,8 +1042,6 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword,
             }
             return -1;
         }
-        if (strcmp(keyword, "Tag") == 0)
-            return parse_bin(value, &cdat->tag, &cdat->tag_len);
         if (strcmp(keyword, "SetTagLate") == 0) {
             if (strcmp(value, "TRUE") == 0)
                 cdat->tag_late = 1;
@@ -1048,8 +1051,6 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword,
                 return -1;
             return 1;
         }
-        if (strcmp(keyword, "MACKey") == 0)
-            return parse_bin(value, &cdat->mac_key, &cdat->mac_key_len);
         if (strcmp(keyword, "TLSVersion") == 0) {
             char *endptr;
 
@@ -1349,6 +1350,12 @@ static int cipher_test_enc(EVP_TEST *t, int enc, size_t out_misalign,
             t->err = "TAG_SET_ERROR";
             goto err;
         }
+    } else if (!enc && expected->mac_key && expected->tag) {
+       if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG,
+                                 expected->tag_len, expected->tag) <= 0) {
+            t->err = "TAG_SET_ERROR";
+            goto err;
+        }
     }
     if (expected->xts_standard != NULL) {
         OSSL_PARAM params[2];
@@ -1443,6 +1450,48 @@ static int cipher_test_enc(EVP_TEST *t, int enc, size_t out_misalign,
                                 rtag, expected->tag_len))
             goto err;
     }
+    if (enc && expected->tag) {
+        if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA1-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA1-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM")
+            || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")) {
+            unsigned char rtag[32] = {0};
+            unsigned tag_len = 0;
+            OSSL_PARAM params[2];
+
+            if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA1-ETM")
+                || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA1-ETM")
+                || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM")) {
+                tag_len = 20;
+            } else if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM")
+                       || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM")
+                       || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")) {
+                tag_len = 32;
+            }
+
+            if (!TEST_size_t_le(expected->tag_len, tag_len)) {
+                t->err = "TAG_LENGTH_INTERNAL_ERROR";
+                goto err;
+            }
+
+            params[0] = OSSL_PARAM_construct_octet_string(OSSL_CIPHER_HMAC_PARAM_MAC,
+                                                          &rtag[0],
+                                                          tag_len);
+            params[1] = OSSL_PARAM_construct_end();
+
+            if (!EVP_CIPHER_CTX_get_params(ctx, params)) {
+                t->err = "TAG_RETRIEVE_ERROR";
+                goto err;
+            }
+
+            if (!memory_err_compare(t, "TAG_VALUE_MISMATCH",
+                                    expected->tag, expected->tag_len,
+                                    rtag, expected->tag_len))
+                goto err;
+        }
+    }
     /* Check the updated IV */
     if (expected->next_iv != NULL) {
         /* Some (e.g., GCM) tests use IVs longer than EVP_MAX_IV_LENGTH. */
index ef2d1a27f98ef55ad9199c35b1ab20d35baf29d5..06da481bb5b80e8453fa643df682bc67370ddbfe 100644 (file)
@@ -124,3 +124,108 @@ TLSVersion = 0x0302
 Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
 Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a96221b924d9d025ef5e7150d4ca76921a025dd146fef87da738877313f11ec8f4c558b878c28ce6a9a5011d70f58c5dbd3412cf0a32154f5a4286958a5a50a86f15119835ceccf432601e4cc688cdd682ac9620500b60c0760bb93209859823778a7f2b5bab1af259bda13d84f952af9d2f07f500dadedc41a2b6a737a1296e0b2fb96ac4da4bf71fe2f0c4a1b6fc4dd251087e4c03d2e28c85a9b4a835ef166b48e5b7690f332a1d8db7bd9380221891f31ee82f4b8dd9ebf540cab583a0f33
 NextIV = 1f31ee82f4b8dd9ebf540cab583a0f33
+
+Title = AES-128-CBC-HMAC-SHA1-ETM test vectors
+
+Cipher = AES-128-CBC-HMAC-SHA1-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be
+Tag = 23f889888b834208235ad034ec087674f5d80a4a
+
+Cipher = AES-128-CBC-HMAC-SHA1-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c
+Tag = 1b001f67b5438782bffb7febbca4ef4cca9f56ab
+
+Cipher = AES-128-CBC-HMAC-SHA1-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c3bfadb50a6126c4fe31d52606b97f347a9d6722a458cc2afdd895c3a247d11e551398180bc445b0ea94d17a1a441fb10b86d84a7549e03b6edf1a12591c63dfa167f2f11ea12b2d3d8f62d92be9238d1e6eed2099f3d0f9e1fe541618bbda588899002c3078202a2d138942c4325b673e494b310a502cda70e8f62480776c31068cb3d2f4c250b9e65669d950b1a4d50cf5f2b11c74960347885e8dbb89d58f24871c34f1a134b1873222b24a310f8bb3299ca1d16cb1921c97fb462e3150b57909ec7d376e93e52ea9e51094f22f11273c32403c82acebf575b7b7af7c98976adf6f4bd4199bd9201fa7321aaad828bfcc3785776f959484ff013d8a66d579af036a6c0e82d94e6eb773f6124f18da5ca4cf5b70f72e9d852766af78269d36a03eb2e2cdda79f16c0f81be27b6593c3f4e9d19cb7018a7e4ca74756dd66ac1b45a4d741e0431d120a7f84dbbc4d7d478b54464050e62d8da0c856ccbc2dcd4dec4aa4d554ac4cce8fbeca8ba4efb55a25771f425a6e5bd74c35972c3da41eeee7fb36b5075e5ab3115f7424f0dab05a085185e923d9ad3e74dc16ff2ecfe03afdf34ba17babafc65aa87600c632ccdcbcc1b591d723eb37a8a3f869cce9fe41
+Tag = 2824154e89a5867c46eebafb04ae3e3e4938f8bb
+
+Title = AES-256-CBC-HMAC-SHA1-ETM test vectors
+
+Cipher = AES-256-CBC-HMAC-SHA1-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0
+Tag = f7bce8a7f8a307b736f6fabb194fc29ceed3e0df
+
+Cipher = AES-256-CBC-HMAC-SHA1-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a
+Tag = b86074c76249f1f058674c514dd52225c9bee36e
+
+Cipher = AES-256-CBC-HMAC-SHA1-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a4ca75016ae77ac335ba758396232a87ffceacf24a0e287371eaa04570cb68dcd61882e1c3f7aca38afed34138fedefe167bb9c741ebd14da2eba3cf5b9aa06bb93ca61fa462de7e1f439efac5ea55edab61171250be36da513e6b5f92c8267f778cdde5720128a586c7bbd5864686b12710daa9f133706e81fa3a066bd1f29277c08ca8f052b3ed06f04ec2a8509f54934fd9b06f4115e546011ff485ac76d5fce0329c94bf5f29726bed49ace94abf53b036c1f920f8c71d44deca7b11f653025698425717bb3cc8f5e74230d8ede675ee0eae6f8aae274152c7503c567427a71323feb84b0fc0515030c933e4c7399be13322b5d4ccabb97c011d75de82f38a540e972bc2a515dc31d50e78b74be891cc4a2ddbe4b50d0d27c069985a581b80a9f591a4bb198f085af2138ca9b4f595c37d60f15d960b1e39de7ff92a699d9aca4a44ff9d327c7130e6b0ce90032e358f3743d8abccaeb0426226d6ec233fdf289bdde5f3b2756a587a382e3353d77acb9774bd64978629633f2122d1fa376b12cfbe4781d6a35227d71fdfa929c1435596fbaf7fe0aea4fa02c6b9e8099c62149ed82819a2088b72660be8ea364c13d5340be93cab8ac92914d2b1115cbb7
+Tag = 8824b6ed9be82651706c292047c08269fd6c943b
+
+Title = AES-128-CBC-HMAC-SHA256-ETM test vectors
+
+Cipher = AES-128-CBC-HMAC-SHA256-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be
+Tag = 3f1dd3b858ecc9d8beea6db830a1fe6f362b48909974d44fa0c9ef7d22e515e7
+
+Cipher = AES-128-CBC-HMAC-SHA256-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c
+Tag = 4cd35de98355fda7334262a3d9eb26d0cd4c4d9b2c58b5a107bd8728da18e6bb
+
+Cipher = AES-128-CBC-HMAC-SHA256-ETM
+Key = feffe9928665731c6d6a8f9467308308
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c3bfadb50a6126c4fe31d52606b97f347a9d6722a458cc2afdd895c3a247d11e551398180bc445b0ea94d17a1a441fb10b86d84a7549e03b6edf1a12591c63dfa167f2f11ea12b2d3d8f62d92be9238d1e6eed2099f3d0f9e1fe541618bbda588899002c3078202a2d138942c4325b673e494b310a502cda70e8f62480776c31068cb3d2f4c250b9e65669d950b1a4d50cf5f2b11c74960347885e8dbb89d58f24871c34f1a134b1873222b24a310f8bb3299ca1d16cb1921c97fb462e3150b57909ec7d376e93e52ea9e51094f22f11273c32403c82acebf575b7b7af7c98976adf6f4bd4199bd9201fa7321aaad828bfcc3785776f959484ff013d8a66d579af036a6c0e82d94e6eb773f6124f18da5ca4cf5b70f72e9d852766af78269d36a03eb2e2cdda79f16c0f81be27b6593c3f4e9d19cb7018a7e4ca74756dd66ac1b45a4d741e0431d120a7f84dbbc4d7d478b54464050e62d8da0c856ccbc2dcd4dec4aa4d554ac4cce8fbeca8ba4efb55a25771f425a6e5bd74c35972c3da41eeee7fb36b5075e5ab3115f7424f0dab05a085185e923d9ad3e74dc16ff2ecfe03afdf34ba17babafc65aa87600c632ccdcbcc1b591d723eb37a8a3f869cce9fe41
+Tag = e23b2b23e56698fcbe2bde48035a863bb73a58e9e12d7e0de2fda0f82ff87676
+
+Title = AES-256-CBC-HMAC-SHA256-ETM test vectors
+
+Cipher = AES-256-CBC-HMAC-SHA256-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0
+Tag = 38bc6d7930f516a29b17ede8388d42faa612b163021028b0d86b08c3b87cd31e
+
+Cipher = AES-256-CBC-HMAC-SHA256-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a
+Tag = b3e7c334cf7906f5dfe8e20bb6a332578dd8e7fb688a7dcd299c1caba3fefbe5
+
+Cipher = AES-256-CBC-HMAC-SHA256-ETM
+Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5
+MACKey = cafebabefacedbaddecaf88801020304
+IV = 101112131415161718191a1b1c1d1e1f
+Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f
+Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a4ca75016ae77ac335ba758396232a87ffceacf24a0e287371eaa04570cb68dcd61882e1c3f7aca38afed34138fedefe167bb9c741ebd14da2eba3cf5b9aa06bb93ca61fa462de7e1f439efac5ea55edab61171250be36da513e6b5f92c8267f778cdde5720128a586c7bbd5864686b12710daa9f133706e81fa3a066bd1f29277c08ca8f052b3ed06f04ec2a8509f54934fd9b06f4115e546011ff485ac76d5fce0329c94bf5f29726bed49ace94abf53b036c1f920f8c71d44deca7b11f653025698425717bb3cc8f5e74230d8ede675ee0eae6f8aae274152c7503c567427a71323feb84b0fc0515030c933e4c7399be13322b5d4ccabb97c011d75de82f38a540e972bc2a515dc31d50e78b74be891cc4a2ddbe4b50d0d27c069985a581b80a9f591a4bb198f085af2138ca9b4f595c37d60f15d960b1e39de7ff92a699d9aca4a44ff9d327c7130e6b0ce90032e358f3743d8abccaeb0426226d6ec233fdf289bdde5f3b2756a587a382e3353d77acb9774bd64978629633f2122d1fa376b12cfbe4781d6a35227d71fdfa929c1435596fbaf7fe0aea4fa02c6b9e8099c62149ed82819a2088b72660be8ea364c13d5340be93cab8ac92914d2b1115cbb7
+Tag = 8cb8898a5b559984da3cbaa4703c9ed3cfc2f56c7292a3279a3dd5f7475412e1
+
index 059b48973562be5b05f8ade5d2ec701d9da02d4d..f9acf03764027e3d35554eae112b7110049dfcc4 100644 (file)
@@ -150,6 +150,8 @@ my %params = (
     'CIPHER_PARAM_ALGORITHM_ID_PARAMS' =>  '*ALG_PARAM_ALGORITHM_ID_PARAMS',
     'CIPHER_PARAM_ALGORITHM_ID_PARAMS_OLD' => "alg_id_param", # octet_string
     'CIPHER_PARAM_XTS_STANDARD' =>         "xts_standard",# utf8_string
+    'CIPHER_PARAM_ENCRYPT_THEN_MAC' =>     "encrypt-then-mac",# int, 0 or 1
+    'CIPHER_HMAC_PARAM_MAC' =>             "*CIPHER_PARAM_AEAD_TAG",
 
     'CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT' =>  "tls1multi_maxsndfrag",# uint
     'CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_BUFSIZE' =>        "tls1multi_maxbufsz",  # size_t