From 86408fa8de640ebf09b08cb5fce8173d2dbc5702 Mon Sep 17 00:00:00 2001 From: "fangming.fang" Date: Wed, 17 Jan 2024 10:48:55 +0000 Subject: [PATCH] Implement interleaving aes-cbc-hmac-sha on aarch64 This is to implement #19932, it adds enc-then-mac aes-cbc-hmac-sha1/256, aes-cbc and hmac-sha1/256 are interleaved to achieve better performance. It only supports non-padding mode that means the length of input data should be multiple of 16 bytes. Reviewed-by: Tomas Mraz Reviewed-by: Tom Cosgrove (Merged from https://github.com/openssl/openssl/pull/22949) --- apps/enc.c | 1 + apps/lib/opt.c | 2 + crypto/aes/asm/aes-sha1-armv8.pl | 4312 +++++++++++++++ crypto/aes/asm/aes-sha256-armv8.pl | 4631 +++++++++++++++++ crypto/aes/build.info | 8 +- crypto/evp/evp_lib.c | 9 +- crypto/objects/obj_dat.h | 24 +- crypto/objects/obj_mac.num | 6 + crypto/objects/objects.txt | 6 + include/crypto/aes_platform.h | 11 +- include/openssl/evp.h | 1 + include/openssl/obj_mac.h | 24 + .../common/include/prov/providercommon.h | 2 + providers/defltprov.c | 12 + providers/fips/fipsprov.c | 12 + providers/implementations/ciphers/build.info | 3 + .../ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c | 179 + .../cipher_aes_cbc_hmac_sha256_etm_hw.c | 179 + .../ciphers/cipher_aes_cbc_hmac_sha_etm.c | 310 ++ .../ciphers/cipher_aes_cbc_hmac_sha_etm.h | 64 + .../implementations/ciphers/ciphercommon.c | 6 + .../include/prov/implementations.h | 6 + .../implementations/include/prov/names.h | 6 + test/evp_libctx_test.c | 3 +- test/evp_test.c | 59 +- .../30-test_evp_data/evpciph_aes_stitched.txt | 105 + util/perl/OpenSSL/paramnames.pm | 2 + 27 files changed, 9969 insertions(+), 14 deletions(-) create mode 100644 crypto/aes/asm/aes-sha1-armv8.pl create mode 100644 crypto/aes/asm/aes-sha256-armv8.pl create mode 100644 providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c create mode 100644 providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c create mode 100644 providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c create mode 100644 providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h diff --git a/apps/enc.c b/apps/enc.c index 3f45ba15e57..cf26ac0a7d5 100644 --- a/apps/enc.c +++ b/apps/enc.c @@ -807,6 +807,7 @@ static void show_ciphers(const OBJ_NAME *name, void *arg) cipher = EVP_get_cipherbyname(name->name); if (cipher == NULL || (EVP_CIPHER_get_flags(cipher) & EVP_CIPH_FLAG_AEAD_CIPHER) != 0 + || (EVP_CIPHER_get_flags(cipher) & EVP_CIPH_FLAG_ENC_THEN_MAC) != 0 || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_XTS_MODE) return; diff --git a/apps/lib/opt.c b/apps/lib/opt.c index 0018c268c0b..39276d828c8 100644 --- a/apps/lib/opt.c +++ b/apps/lib/opt.c @@ -437,6 +437,8 @@ int opt_cipher(const char *name, EVP_CIPHER **cipherp) opt_printf_stderr("%s XTS ciphers not supported\n", prog); } else if ((flags & EVP_CIPH_FLAG_AEAD_CIPHER) != 0) { opt_printf_stderr("%s: AEAD ciphers not supported\n", prog); + } else if ((flags & EVP_CIPH_FLAG_ENC_THEN_MAC) != 0) { + opt_printf_stderr("%s: ENC-then-MAC cipher not supported\n", prog); } else { ret = 1; if (cipherp != NULL) diff --git a/crypto/aes/asm/aes-sha1-armv8.pl b/crypto/aes/asm/aes-sha1-armv8.pl new file mode 100644 index 00000000000..8f8505d1771 --- /dev/null +++ b/crypto/aes/asm/aes-sha1-armv8.pl @@ -0,0 +1,4312 @@ +#! /usr/bin/env perl + +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (C) Cavium networks Ltd. 2016. +# +# Licensed under the Apache License 2.0 (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +#======================================================================== +# Derived from following files in +# https://github.com/ARM-software/AArch64cryptolib +# AArch64cryptolib_opt_big/aes_cbc_sha1/aes128cbc_sha1_hmac.S +# AArch64cryptolib_opt_big/aes_cbc_sha1/sha1_hmac_aes128cbc_dec.S +#======================================================================== + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$code=<<___; +#include "arm_arch.h" + +# Theses are offsets into the CIPH_DIGEST struct +#define CIPHER_KEY 0 +#define CIPHER_KEY_ROUNDS 8 +#define CIPHER_IV 16 +#define HMAC_IKEYPAD 24 +#define HMAC_OKEYPAD 32 + +.text +.arch armv8-a+crypto +___ + +sub aes192_aes256_handle () { + my $compare = shift; + my $label = shift; + my $i = shift; + my $load_rk10 = shift; + + if($compare == 1) { +$code.=<<___; + cmp x16,#12 +___ + } +$code.=<<___; + b.lt .Laes128_${label}_$i +.Laes192_${label}_$i: + ldp q30,q31,[x17],32 /* rk[10],rk[11] */ + aese v$i.16b,v17.16b + aesmc v$i.16b,v$i.16b + aese v$i.16b,v30.16b + aesmc v$i.16b,v$i.16b + b.gt .Laes256_${label}_$i + ld1 {v30.16b},[x17] /* rk[12] */ + aese v$i.16b,v31.16b + eor v$i.16b,v$i.16b,v30.16b + sub x17, x17, #32 /* rewind x17 */ + b 1f +.Laes256_${label}_$i: + aese v$i.16b,v31.16b + aesmc v$i.16b,v$i.16b + ldp q30,q31,[x17],32 /* rk[12],rk[13] */ + aese v$i.16b,v30.16b + aesmc v$i.16b,v$i.16b + ld1 {v30.16b},[x17] /* rk[14] */ + aese v$i.16b,v31.16b + eor v$i.16b,v$i.16b,v30.16b + sub x17, x17, #64 /* rewind x17 */ + b 1f +.Laes128_${label}_$i: +___ + if ($load_rk10 == 1) { +$code.=<<___; + ld1 {v18.16b},[x9] +___ + } +$code.=<<___; + aese v$i.16b,v17.16b + eor v$i.16b,v$i.16b,v18.16b /* res 0 */ +1: +___ +} + +sub aes192_aes256_dec_handle () { + my $compare = shift; + my $label = shift; + my $i = shift; + my $load_rk10 = shift; + + if($compare == 1) { +$code.=<<___; + cmp x16,#12 +___ + } +$code.=<<___; + b.lt .Laes128_${label}_$i +.Laes192_${label}_$i: + stp q19,q23,[sp, #-32]! 
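+	/*
+	 * v19/v23 carry SHA1 temporaries in the callers of this helper
+	 * (see the routine register notes below); spill them so they can
+	 * stage the extra AES-192/256 round keys, then restore them
+	 * before rejoining the common path.
+	 */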
+ ld1 {v19.16b},[x17],16 /* rk[10] */ + ld1 {v23.16b},[x17],16 /* rk[11] */ + aesd v$i.16b,v17.16b + aesimc v$i.16b,v$i.16b + aesd v$i.16b,v19.16b + aesimc v$i.16b,v$i.16b + b.gt .Laes256_${label}_$i + ld1 {v19.16b},[x17] /* rk[12] */ + aesd v$i.16b,v23.16b + eor v$i.16b,v$i.16b,v19.16b + sub x17, x17, #32 /* rewind x17 */ + ldp q19,q23,[sp], #32 + b 1f +.Laes256_${label}_$i: + aesd v$i.16b,v23.16b + aesimc v$i.16b,v$i.16b + ld1 {v19.16b},[x17],16 /* rk[12] */ + ld1 {v23.16b},[x17],16 /* rk[13] */ + aesd v$i.16b,v19.16b + aesimc v$i.16b,v$i.16b + ld1 {v19.16b},[x17] /* rk[14] */ + aesd v$i.16b,v23.16b + eor v$i.16b,v$i.16b,v19.16b + sub x17, x17, #64 /* rewind x17 */ + ldp q19,q23,[sp], #32 + b 1f +.Laes128_${label}_$i: +___ + if ($load_rk10 == 1) { +$code.=<<___; + ld1 {v18.16b},[x9] +___ + } +$code.=<<___; + aesd v$i.16b,v17.16b + eor v$i.16b,v$i.16b,v18.16b /* res 0 */ +1: +___ +} + +$code.=<<___; +# Description: +# +# Combined Enc/Auth Primitive = aes128cbc/sha1_hmac +# +# Operations: +# +# out = encrypt-AES128CBC(in) +# return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | out)) +# +# Prototype: +# int asm_aescbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst, uint64_t clen, +# uint8_t *dsrc, uint8_t *ddst, uint64_t dlen, +# CIPH_DIGEST *arg) +# +# Registers used: +# +# asm_aescbc_sha1_hmac( +# csrc, x0 (cipher src address) +# cdst, x1 (cipher dst address) +# clen x2 (cipher length) +# dsrc, x3 (digest src address) +# ddst, x4 (digest dst address) +# dlen, x5 (digest length) +# arg x6: +# arg->cipher.key (round keys) +# arg->cipher.key_rounds (key rounds) +# arg->cipher.iv (initialization vector) +# arg->digest.hmac.i_key_pad (partially hashed i_key_pad) +# arg->digest.hmac.o_key_pad (partially hashed o_key_pad) +# ) +# +# Routine register definitions: +# +# v0 - v3 -- aes results +# v4 - v7 -- round consts for sha +# v8 - v18 -- round keys +# v19 -- temp register for SHA1 +# v20 -- ABCD copy (q20) +# v21 -- sha working state (q21) +# v22 -- sha working state (q22) +# v23 -- temp register for SHA1 +# v24 -- sha state ABCD +# v25 -- sha state E +# v26 -- sha block 0 +# v27 -- sha block 1 +# v28 -- sha block 2 +# v29 -- sha block 3 +# v30 -- reserved +# v31 -- reserved +# +# Constraints: +# +# The variable "clen" must be a multiple of 16, otherwise results are not +# defined. For AES partial blocks the user is required to pad the input +# to modulus 16 = 0. +# The variable "dlen" must be a multiple of 8 and greater or equal +# to "clen". This constraint is strictly related to the needs of the IPSec +# ESP packet. Encrypted payload is hashed along with the 8 byte ESP header, +# forming ICV. Speed gain is achieved by doing both things at the same time, +# hence lengths are required to match at least at the cipher level. +# +# Short lengths are not optimized at < 12 AES blocks + +.global asm_aescbc_sha1_hmac +.type asm_aescbc_sha1_hmac,%function + +.align 4 +.Lrcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + +asm_aescbc_sha1_hmac: + AARCH64_VALID_CALL_TARGET + /* protect registers */ + stp d8,d9,[sp,#-64]! 
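+	/* AAPCS64 only requires the low 64 bits of v8-v15 to be preserved,
+	 * hence the d-register saves; d10-d15 are stored below, interleaved
+	 * with the argument fetches */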
+ /* fetch args */ + ldr x7, [x6, #HMAC_IKEYPAD] + /* init ABCD, E */ + ldr q24, [x7] + eor v25.16b, v25.16b, v25.16b + ldr s25, [x7, #16] + /* save pointer to o_key_pad partial hash */ + ldr x7, [x6, #HMAC_OKEYPAD] + + stp d10,d11,[sp,#16] + + prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x2,4 /* aes_blocks = len/16 */ + + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + + ldr x9, [x6, #CIPHER_KEY] + ldr x16, [x6, #CIPHER_KEY_ROUNDS] + ldr x6, [x6, #CIPHER_IV] + add x17, x9, #160 /* point to the last 5 rounds keys */ + + /* + * init sha state, prefetch, check for small cases. + * Note that the output is prefetched as a load, for the in-place case + */ + cmp x10,12 /* no main loop if <12 */ + b.lt .Lenc_short_cases /* branch if < 12 */ + + /* proceed */ + ld1 {v3.16b},[x6] /* get 1st ivec */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov x11,x2 /* len -> x11 needed at end */ + lsr x12,x11,6 /* total_blocks */ + /* + * now we can do the loop prolog, 1st aes sequence of 4 blocks + */ + ldp q8,q9,[x9],32 /* rk[0],rk[1] */ + eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */ + + /* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + ldp q10,q11,[x9],32 /* rk[2],rk[3] */ + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + /* base address for sha round consts */ + adr x8,.Lrcon + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + ldp q12,q13,[x9],32 /* rk[4],rk[5] */ + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + ldp q14,q15,[x9],32 /* rk[6],rk[7] */ + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + ldp q16,q17,[x9],32 /* rk[8],rk[9] */ + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b +___ + &aes192_aes256_handle(1, "enc_prolog", 0, 1); +$code.=<<___; + eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */ + + /* aes xform 1 */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,0*64] /* rcon */ + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + ld1 {v26.16b},[x3],16 + prfm PLDL1KEEP,[x8,2*64] /* rcon */ + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,4*64] /* rcon */ + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,6*64] /* rcon */ +___ + &aes192_aes256_handle(0, "enc_prolog", 1, 0); +$code.=<<___; + prfm PLDL1KEEP,[x8,8*64] /* rcon */ + eor v2.16b,v2.16b,v1.16b /* xor w/ivec (modeop) */ + + /* aes xform 2 */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + mov x9,x0 /* lead_ptr = aes_ptr_in */ + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + prfm PLDL1KEEP,[x8,10*64] /* rcon */ + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + ld1 {v27.16b},[x3],16 + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + prfm PLDL1KEEP,[x8,12*64] /* rcon */ + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + 
aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + prfm PLDL1KEEP,[x8,14*64] /* rcon */ + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b +___ + &aes192_aes256_handle(0, "enc_prolog", 2, 0); +$code.=<<___; + eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */ + + /* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + ld1 {v28.16b},[x3],16 + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + /* main_blocks = total_blocks - 1 */ + sub x15,x12,1 + and x13,x10,3 /* aes_blocks_left */ +___ + &aes192_aes256_handle(0, "enc_prolog", 3, 0); +$code.=<<___; + ldp q4,q5,[x8],32 /* key0,key1 */ + /* + * Note, aes_blocks_left := number after + * the main (sha) block is done. Can be 0 + */ + + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + ld1 {v29.16b},[x3],16 + + ldp q6,q7,[x8] /* key2,key3 */ + + /* get outstanding bytes of the digest */ + sub x8,x5,x2 + /* substract loaded bytes */ + sub x5,x5,64 + /* + * main combined loop CBC + */ +.Lenc_main_loop: + /* + * because both mov, rev32 and eor have a busy cycle, this takes longer + * than it looks. + * That's OK since there are 6 cycles before we can use the load anyway; + * so this goes as fast as it can without SW pipelining (too complicated + * given the code size) + */ + rev32 v26.16b,v26.16b + /* next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x9,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + + /* aes xform 0, sha quad 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + rev32 v28.16b,v28.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + sha1h s22,s24 + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + add v23.4s,v4.4s,v27.4s + /* no place to get rid of this stall */ + rev32 v29.16b,v29.16b + sha1c q24,s25,v19.4s + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + sha1c q24,s22,v23.4s + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + add v23.4s,v4.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + sha1h s22,s24 + sha1c q24,s21,v19.4s + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 +___ + &aes192_aes256_handle(1, "enc_mainloop", 0, 0); +$code.=<<___; + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + sha1su0 v26.4s,v27.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su1 v26.4s,v29.4s + /* aes xform 1, sha quad 1 */ + eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + add v19.4s,v5.4s,v28.4s + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + sha1su0 v27.4s,v28.4s,v29.4s 
+ sha1h s21,s24 + sha1p q24,s22,v23.4s + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + add v23.4s,v5.4s,v29.4s + sha1su1 v27.4s,v26.4s + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + sha1p q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + add x9,x9,64 /* bump lead_ptr */ + sha1su0 v26.4s,v27.4s,v28.4s + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + sha1h s22,s24 + add v23.4s,v5.4s,v27.4s + sha1p q24,s21,v19.4s + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s +___ + &aes192_aes256_handle(0, "enc_mainloop", 1, 0); +$code.=<<___; + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v27.4s,v26.4s + + /* mode op 2 */ + eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/prev value */ + + /* aes xform 2, sha quad 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + + add v19.4s,v6.4s,v28.4s + sha1su0 v28.4s,v29.4s,v26.4s + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + sha1su1 v28.4s,v27.4s + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + add v19.4s,v6.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + sha1h s21,s24 + sha1m q24,s22,v23.4s + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + sha1su1 v29.4s,v28.4s + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + add v23.4s,v6.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + add v19.4s,v6.4s,v28.4s + sha1su1 v26.4s,v29.4s +___ + &aes192_aes256_handle(0, "enc_mainloop", 2, 0); +$code.=<<___; + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + + /* mode op 3 */ + eor v3.16b,v3.16b,v2.16b /* xor w/prev value */ + + sha1su1 v28.4s,v27.4s + + /* aes xform 3, sha quad 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + sha1su0 v29.4s,v26.4s,v27.4s + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + sha1su1 v29.4s,v28.4s + add v19.4s,v7.4s,v26.4s + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + add v23.4s,v7.4s,v27.4s + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + sub x15,x15,1 /* dec block count */ + add v19.4s,v7.4s,v28.4s + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + add v23.4s,v7.4s,v29.4s +___ + &aes192_aes256_handle(0, "enc_mainloop", 3, 0); +$code.=<<___; + sha1h s21,s24 + sha1p q24,s22,v23.4s + + ldp q26,q27,[x3],32 + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + + ldp q28,q29,[x3],32 + + sub x5,x5,64 + cbnz 
x15,.Lenc_main_loop /* loop if more to do */ + + mov w15,0x80 /* that's the 1 of the pad */ + /* + * epilog, process remaining aes blocks and b-2 sha block + * do this inline (no loop) to overlap with the sha part + * note there are 0-3 aes blocks left. + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + cbz x13, .Lbm2fromQ0 /* skip if none left */ + + /* + * mode op 0 + * read next aes block, update aes_ptr_in + */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + + /* aes xform 0, sha quad 0 */ + add v19.4s,v4.4s,v26.4s + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + add v23.4s,v4.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + sha1h s22,s24 + sha1c q24,s25,v19.4s + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + sha1su1 v26.4s,v29.4s + add v19.4s,v4.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + sha1h s21,s24 + sha1c q24,s22,v23.4s + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + sha1su1 v27.4s,v26.4s + add v23.4s,v4.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + sha1h s22,s24 + sha1c q24,s21,v19.4s + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v4.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + sha1h s21,s24 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + sha1c q24,s22,v23.4s + sha1su1 v29.4s,v28.4s +___ + &aes192_aes256_handle(1, "enc_epilog", 0, 0); +$code.=<<___; + /* local copy of aes_blocks_left */ + subs x14,x13,1 + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ1 + /* + * mode op 1 + * read next aes block, update aes_ptr_in + */ + ld1 {v1.16b},[x0],16 + + eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */ + + /* aes xform 1, sha quad 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + add v19.4s,v5.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + sha1su1 v27.4s,v26.4s + add v23.4s,v5.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v5.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + sha1su1 v26.4s,v29.4s +___ + &aes192_aes256_handle(1, "enc_epilog", 1, 0); +$code.=<<___; + subs x14,x14,1 /* dec counter */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ2 + + /* + * mode op 2 + * read next aes block, update aes_ptr_in + */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */ + + /* aes xform 2, sha quad 2 */ + aese 
v2.16b,v8.16b + aesmc v2.16b,v2.16b + add v23.4s,v6.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v6.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + sha1h s21,s24 + sha1m q24,s22,v23.4s + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + sha1su1 v29.4s,v28.4s + add v23.4s,v6.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + sha1su1 v26.4s,v29.4s + add v19.4s,v6.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + sha1h s21,s24 + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + sha1m q24,s22,v23.4s + sha1su1 v27.4s,v26.4s +___ + &aes192_aes256_handle(1, "enc_epilog", 2, 0); +$code.=<<___; + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* join common code at Quad 3 */ + b .Lbm2fromQ3 + + /* + * now there is the b-2 sha block before the final one. Execution takes over + * in the appropriate part of this depending on how many aes blocks were left. + * If there were none, the whole thing is executed. + */ +.Lbm2fromQ0: + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + +.Lbm2fromQ1: + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + +.Lbm2fromQ2: + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + +.Lbm2fromQ3: + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + eor v26.16b,v26.16b,v26.16b /* zero reg */ + 
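+	/* v26-v29 are cleared in the spare issue slots between the final
+	 * rounds; they become the zero-filled padding block processed next */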
sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + /* Process remaining 0-3 AES blocks here */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + cbz x13,.Lpost_long_Q0 + + /* 1st remaining AES block */ + ld1 {v26.16b},[x3],16 + sub x5,x5,16 + rev32 v26.16b,v26.16b + subs x14,x13,1 + b.eq .Lpost_long_Q1 + + /* 2nd remaining AES block */ + ld1 {v27.16b},[x3],16 + sub x5,x5,16 + rev32 v27.16b,v27.16b + subs x14,x14,1 + b.eq .Lpost_long_Q2 + + /* 3rd remaining AES block */ + ld1 {v28.16b},[x3],16 + sub x5,x5,16 + rev32 v28.16b,v28.16b + /* Allow for filling this sha1 block with the remaining digest src */ + b .Lpost_long_Q3 + /* + * Process remaining 8B blocks of the digest + */ +.Lpost_long_Q0: + /* blk 0,1 */ + /* assume final block */ + mov v26.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v26 value (0x80) */ + mov v26.d[0],x2 + /* assume this was final block */ + mov v26.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v26.d[1],x2 + +.Lpost_long_Q1: + /* blk 2,3 */ + /* assume this is final block */ + mov v27.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v27 value (0x80) */ + mov v27.d[0],x2 + /* assume this was final block */ + mov v27.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v27.d[1],x2 + +.Lpost_long_Q2: + /* blk 4,5 */ + /* assume this was final block */ + mov v28.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v28 value (0x80) */ + mov v28.d[0],x2 + /* assume this was final block */ + mov v28.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v28.d[1],x2 + +.Lpost_long_Q3: + /* blk 6,7 */ + /* assume this was final block */ + mov v29.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v29 value (0x80) */ + mov v29.d[0],x2 + /* assume this was final block */ + mov v29.b[11],w15 + /* + * Outstanding 8B blocks left. + * Since there has to be another sha block with padding, + * we need to calculate hash without padding here. + */ + cbz x5,1f + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + rev32 x2,x2 + /* + * Don't decrease x5 here. + * Use it to indicate necessity of constructing "1" padding at the end. + */ + mov v29.d[1],x2 + /* + * That is enough of blocks, we allow up to 64 bytes in total. 
+ * Now we have the sha1 to do for these 4 16B blocks + */ +1: + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + add v19.4s,v4.4s,v26.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + /* this was final block */ + cbz x5,.Lpost_long_loop + subs x5,x5,8 + /* loop if hash is not finished */ + b.ne .Lpost_long_Q0 + /* set "1" of the padding if this was a final block */ + mov v26.b[3],w15 + +.Lpost_long_loop: + /* Add outstanding bytes of digest source */ + add x11,x11,x8 + /* Add one SHA-1 block since hash is calculated including i_key_pad */ + add x11,x11, #64 + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + /* do last sha of pad block */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + add v19.4s,v4.4s,v26.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 
v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v26.4s,v24.4s,v20.4s + add v27.4s,v25.4s,v21.4s + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* load o_key_pad partial hash */ + ldr q24, [x7] + eor v25.16b, v25.16b, v25.16b + ldr s25, [x7, #16] + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v27.b[7], w11 + + mov x11, #64+20 /* size of o_key_pad + inner hash */ + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v19.4s,v4.4s,v26.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 
v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + ldp d14,d15,[sp,#48] + ldp d8,d9,[sp],#64 + + mov x0, xzr + + add v24.4s,v24.4s,v20.4s + add v25.4s,v25.4s,v21.4s + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + + st1 {v24.16b}, [x4],16 + st1 {v25.s}[0], [x4] + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. + * x10 = aes_blocks + */ +.Lenc_short_cases: + ldp q8,q9,[x9],32 + adr x8,.Lrcon /* rcon */ + mov w15,0x80 /* sha padding word */ + ldp q10,q11,[x9],32 + lsl x11,x10,4 /* len = aes_blocks*16 */ + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + ldp q12,q13,[x9],32 + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + ldp q14,q15,[x9],32 + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + ldp q4,q5,[x8],32 /* key0, key1 */ + ldp q16,q17,[x9],32 + ld1 {v3.16b},[x6] /* get ivec */ + ldp q6,q7,[x8] /* key2, key3 */ + /* get outstanding bytes of the digest */ + sub x8,x5,x2 + /* + * the idea in the short loop (at least 1) is to break out with the padding + * already in place excepting the final word. 
+ */ +.Lenc_short_loop: + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + + /* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b +___ + &aes192_aes256_handle(1, "enc_short", 0, 1); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* load next 16 bytes for SHA-1 */ + ld1 {v26.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + sub x10,x10,1 /* dec num_blocks */ + /* load res to sha 0, endian swap */ + rev32 v26.16b,v26.16b + cbz x10,.Lpost_short_Q1 /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */ + + /* aes xform 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b +___ + &aes192_aes256_handle(1, "enc_short", 1, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* load next 16 bytes for SHA-1 */ + ld1 {v27.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + sub x10,x10,1 /* dec num_blocks */ + /* load res to sha 0, endian swap */ + rev32 v27.16b,v27.16b + cbz x10,.Lpost_short_Q2 /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */ + + /* aes xform 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b +___ + &aes192_aes256_handle(1, "enc_short", 2, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* load next 16 bytes for SHA-1 */ + ld1 {v28.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + sub x10,x10,1 /* dec num_blocks */ + /* load res to sha 0, endian swap */ + rev32 v28.16b,v28.16b + cbz x10,.Lpost_short_Q3 /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + eor v3.16b,v3.16b,v2.16b /* xor w/prev value */ + + /* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b +___ + &aes192_aes256_handle(1, "enc_short", 3, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + /* load next 16 bytes for SHA-1 */ + ld1 
{v29.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + /* load res to sha 0, endian swap */ + rev32 v29.16b,v29.16b + /* + * now we have the sha1 to do for these 4 aes blocks + */ + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + sub x10,x10,1 /* dec num_blocks */ + cbnz x10,.Lenc_short_loop /* keep looping if more */ + +.Lpost_short_Q0: + /* assume this was final block */ + mov v26.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v26 value (0x80) */ + mov v26.d[0],x2 + /* assume this was final block */ + mov v26.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v26.d[1],x2 +.Lpost_short_Q1: + /* zero out vectors */ + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + /* assume this is final block */ + mov 
v27.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v27 value (0x80) */ + mov v27.d[0],x2 + /* assume this was final block */ + mov v27.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v27.d[1],x2 +.Lpost_short_Q2: + /* zero out vectors (repeated if came from Q0) */ + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + /* assume this was final block */ + mov v28.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v28 value (0x80) */ + mov v28.d[0],x2 + /* assume this was final block */ + mov v28.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v28.d[1],x2 +.Lpost_short_Q3: + /* zero out vector (repeated if came from Q1) */ + eor v29.16b,v29.16b,v29.16b + /* assume this was final block */ + mov v29.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v29 value (0x80) */ + mov v29.d[0],x2 + /* assume this was final block */ + mov v29.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,1f + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + rev32 x2,x2 + mov v29.d[1],x2 + /* + * That is enough of blocks, we allow up to 64 bytes in total. 
+ * Now we have the sha1 to do for these 4 16B blocks + */ +1: + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + add v19.4s,v4.4s,v26.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + /* this was final block */ + cbz x5,.Lpost_short_loop + subs x5,x5,8 + /* loop if hash is not finished */ + b.ne .Lpost_short_Q0 + /* set "1" of the padding if this was a final block */ + mov v26.b[3],w15 + +/* + * there are between 0 and 3 aes blocks in the final sha1 blocks + */ +.Lpost_short_loop: + /* Add outstanding bytes of digest source */ + add x11,x11,x8 + /* Add one SHA-1 block since hash is calculated including i_key_pad */ + add x11,x11, #64 + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + /* do final block */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + add v19.4s,v4.4s,v26.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c 
q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v26.4s,v24.4s,v20.4s + add v27.4s,v25.4s,v21.4s + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* load o_key_pad partial hash */ + ldr q24, [x7] + eor v25.16b, v25.16b, v25.16b + ldr s25, [x7, #16] + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v27.b[7], w11 + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + mov x11, #64+20 /* size of o_key_pad + inner hash */ + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + add v19.4s,v4.4s,v26.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h 
s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + ldp d14,d15,[sp,#48] + ldp d8,d9,[sp],#64 + + mov x0, xzr + + add v24.4s,v24.4s,v20.4s + add v25.4s,v25.4s,v21.4s + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + + st1 {v24.16b}, [x4],16 + st1 {v25.s}[0], [x4] + + ret + +.size asm_aescbc_sha1_hmac, .-asm_aescbc_sha1_hmac + +# Description: +# +# Combined Auth/Dec Primitive = sha1_hmac/aes128cbc +# +# Operations: +# +# out = decrypt-AES128CBC(in) +# return_ash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | in)) +# +# Prototype: +# asm_sha1_hmac_aescbc_dec(uint8_t *csrc, uint8_t *cdst, uint64_t clen, +# uint8_t *dsrc, uint8_t *ddst, uint64_t dlen, +# CIPH_DIGEST *arg) +# +# Registers used: +# +# asm_sha1_hmac_aescbc_dec( +# csrc, x0 (cipher src address) +# cdst, x1 (cipher dst address) +# clen x2 (cipher length) +# dsrc, x3 (digest src address) +# ddst, x4 (digest dst address) +# dlen, x5 (digest length) +# arg x6 : +# arg->cipher.key (round keys) +# arg->cipher.key_rounds (key rounds) +# arg->cipher.iv (initialization vector) +# arg->digest.hmac.i_key_pad (partially hashed i_key_pad) +# arg->digest.hmac.o_key_pad (partially hashed o_key_pad) +# +# +# Routine register definitions: +# +# v0 - v3 -- aes results +# v4 - v7 -- round consts for sha +# v8 - v18 -- round keys +# v19 -- temp register for SHA1 +# v20 -- ABCD copy (q20) +# v21 -- sha working state (q21) +# v22 -- sha working state (q22) +# v23 -- temp register for SHA1 +# v24 -- sha state ABCD +# v25 -- sha state E +# v26 -- sha block 0 +# v27 -- sha block 1 +# v28 -- sha block 2 +# v29 -- sha block 3 +# v30 -- reserved +# v31 -- reserved +# +# +# Constraints: +# +# The variable "clen" must be a multiple of 16, otherwise results are not +# defined. For AES partial blocks the user is required to pad the input +# to modulus 16 = 0. +# +# The variable "dlen" must be a multiple of 8 and greater or equal to "clen". +# The maximum difference between "dlen" and "clen" cannot exceed 64 bytes. +# This constrain is strictly related to the needs of the IPSec ESP packet. +# Short lengths are less optimized at < 16 AES blocks, however they are +# somewhat optimized, and more so than the enc/auth versions. 
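+# For reference, both entry points take their CIPH_DIGEST argument with
+# the field offsets defined at the top of this file. A minimal C sketch
+# of a compatible layout follows; the authoritative definition is
+# presumably the one added in
+# providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h, and
+# the field types shown here are only illustrative:
+#
+# typedef struct {
+#     struct {
+#         const void *key;           /* CIPHER_KEY:        offset 0  */
+#         uint64_t key_rounds;       /* CIPHER_KEY_ROUNDS: offset 8  */
+#         const uint8_t *iv;         /* CIPHER_IV:         offset 16 */
+#     } cipher;
+#     struct {
+#         struct {
+#             const void *i_key_pad; /* HMAC_IKEYPAD:      offset 24 */
+#             const void *o_key_pad; /* HMAC_OKEYPAD:      offset 32 */
+#         } hmac;
+#     } digest;
+# } CIPH_DIGEST;
+#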
+ +.global asm_sha1_hmac_aescbc_dec +.type asm_sha1_hmac_aescbc_dec,%function + +asm_sha1_hmac_aescbc_dec: + AARCH64_VALID_CALL_TARGET + /* protect registers */ + stp d8,d9,[sp,#-64]! + /* fetch args */ + ldr x7, [x6, #HMAC_IKEYPAD] + /* init ABCD, E */ + ldr q24, [x7] + eor v25.16b, v25.16b, v25.16b + ldr s25, [x7, #16] + /* save pointer to o_key_pad partial hash */ + ldr x7, [x6, #HMAC_OKEYPAD] + + stp d10,d11,[sp,#16] + + prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x2,4 /* aes_blocks = len/16 */ + + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + + ldr x9, [x6, #CIPHER_KEY] + ldr x16, [x6, #CIPHER_KEY_ROUNDS] + ldr x6, [x6, #CIPHER_IV] + add x17, x9, #160 /* point to the last 5 rounds keys */ + /* + * init sha state, prefetch, check for small cases. + * Note that the output is prefetched as a load, for the in-place case + */ + cmp x10,16 /* no main loop if <16 */ + blt .Ldec_short_cases /* branch if < 16 */ + + /* base address for sha round consts */ + adr x8,.Lrcon + ldp q4,q5,[x8],32 /* key0,key1 */ + ldp q6,q7,[x8],32 /* key2,key3 */ + + /* get outstanding bytes of the digest */ + sub x8,x5,x2 + + mov x11,x2 /* len -> x11 needed at end */ + ld1 {v30.16b},[x6] /* get 1st ivec */ + lsr x12,x11,6 /* total_blocks (sha) */ + ldp q26,q27,[x3],32 /* next w0,w1 */ + rev32 v26.16b,v26.16b /* endian swap w0 */ + rev32 v27.16b,v27.16b /* endian swap w1 */ + ldp q28,q29,[x3],32 /* next w2,w3 */ + rev32 v28.16b,v28.16b /* endian swap w2 */ + rev32 v29.16b,v29.16b /* endian swap w3 */ + + /* subtract loaded bytes */ + sub x5,x5,64 + /* + * now we can do the loop prolog, 1st sha1 block + */ + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* + * do the first sha1 block on the digest input + */ + mov v20.16b,v24.16b /* init working ABCD */ + + add v19.4s,v4.4s,v26.4s + add v23.4s,v4.4s,v27.4s + /* quad 0 */ + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + ld1 {v8.16b},[x9],16 /* rk[0] */ + sha1c q24,s25,v19.4s + sha1su1 v26.4s,v29.4s + ld1 {v9.16b},[x9],16 /* rk[1] */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + ld1 {v10.16b},[x9],16 /* rk[2] */ + sha1c q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + add v23.4s,v4.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + ld1 {v11.16b},[x9],16 /* rk[3] */ + sha1c q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + ld1 {v12.16b},[x9],16 /* rk[4] */ + sha1c q24,s21,v19.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v26.4s,v29.4s + ld1 {v13.16b},[x9],16 /* rk[5] */ + /* quad 1 */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + ld1 {v14.16b},[x9],16 /* rk[6] */ + sha1p q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + add v23.4s,v5.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + ld1 {v15.16b},[x9],16 /* rk[7] */ + sha1p q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + ld1 {v16.16b},[x9],16 /* rk[8] */ + sha1p q24,s21,v19.4s + sha1su1 v26.4s,v29.4s + ld1 {v17.16b},[x9],16 /* rk[9] */ + add v19.4s,v6.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + ld1 {v18.16b},[x9],16 /* rk[10] */ + sha1p q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + /* quad 2 */ +
sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v6.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + /* quad 3 */ + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + sha1h s22,s24 + ld1 {v26.16b},[x3],16 /* next w0 */ + sha1p q24,s21,v19.4s + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + ld1 {v27.16b},[x3],16 /* next w1 */ + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + ld1 {v28.16b},[x3],16 /* next w2 */ + sha1p q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + ld1 {v29.16b},[x3],16 /* next w3 */ + sha1p q24,s22,v23.4s + + /* subtract loaded bytes */ + sub x5,x5,64 + /* + * aes_blocks_left := number after the main (sha) block is done. + * Can be 0. Note we account for the extra unwind in main_blocks. + */ + sub x15,x12,2 /* main_blocks=total_blocks-2 */ + add v24.4s,v24.4s,v20.4s + and x13,x10,3 /* aes_blocks_left */ + ld1 {v0.16b},[x0] /* next aes block, no update */ + add v25.4s,v25.4s,v21.4s + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + /* indicate AES blocks to write back */ + mov x9,xzr + /* + * main combined loop CBC, can be used by auth/enc version + */ +.Ldec_main_loop: + /* + * Because mov, rev32 and eor each have a busy cycle, + * this takes longer than it looks.
+ */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + /* aes xform 0, sha quad 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + rev32 v28.16b,v28.16b /* fix endian w2 */ + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + add v23.4s,v4.4s,v27.4s + rev32 v29.16b,v29.16b /* fix endian w3 */ + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + sha1c q24,s25,v19.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + sha1c q24,s22,v23.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + add v23.4s,v4.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + sha1c q24,s21,v19.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",0,0); +$code.=<<___; + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* get next aes block, with update */ + ld1 {v30.16b},[x0],16 + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + /* aes xform 1, sha quad 1 */ + sha1su0 v27.4s,v28.4s,v29.4s + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + sha1h s21,s24 + add v19.4s,v5.4s,v28.4s + sha1p q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + sha1p q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + sha1h s22,s24 + add v19.4s,v5.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1p q24,s21,v19.4s + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + sha1su1 v26.4s,v29.4s + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + add v23.4s,v5.4s,v27.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + sha1su1 v27.4s,v26.4s +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",1,0); +$code.=<<___; + add v19.4s,v6.4s,v28.4s + add v23.4s,v6.4s,v29.4s + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + /* aes xform 2, sha quad 2 */ + sha1su0 v28.4s,v29.4s,v26.4s + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + sha1h s22,s24 + sha1m q24,s21,v19.4s + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + sha1h s21,s24 + sha1m q24,s22,v23.4s + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + sha1su1 v29.4s,v28.4s + add v19.4s,v6.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + 
sha1h s22,s24 + sha1m q24,s21,v19.4s + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + sha1su1 v26.4s,v29.4s + add v23.4s,v6.4s,v27.4s + sha1su0 v27.4s,v28.4s,v29.4s + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + sha1h s21,s24 + sha1m q24,s22,v23.4s + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + sha1su1 v27.4s,v26.4s + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + sha1su0 v28.4s,v29.4s,v26.4s + sha1m q24,s21,v19.4s +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",2,0); +$code.=<<___; + sha1su1 v28.4s,v27.4s + add v23.4s,v7.4s,v29.4s + add v19.4s,v7.4s,v26.4s + eor v2.16b,v2.16b,v30.16b /* mode op 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + /* aes xform 3, sha quad 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + sha1h s21,s24 + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + sha1p q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + sha1h s22,s24 + ld1 {v26.16b},[x3],16 /* next w0 */ + sha1p q24,s21,v19.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + add v23.4s,v7.4s,v27.4s + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + sha1h s21,s24 + ld1 {v27.16b},[x3],16 /* next w1 */ + sha1p q24,s22,v23.4s + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + sub x15,x15,1 /* dec block count */ + add v19.4s,v7.4s,v28.4s + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + ld1 {v0.16b},[x0] /* next aes block, no update */ + sha1h s22,s24 + ld1 {v28.16b},[x3],16 /* next w2 */ + sha1p q24,s21,v19.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",3,0); +$code.=<<___; + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + ld1 {v29.16b},[x3],16 /* next w3 */ + sha1p q24,s22,v23.4s + add v24.4s,v24.4s,v20.4s + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + add v25.4s,v25.4s,v21.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + /* subtract loaded bytes */ + sub x5,x5,64 + /* loop if more to do */ + cbnz x15,.Ldec_main_loop + /* + * Now the loop epilog. Since the reads for sha have already been done + * in advance, we have to have an extra unwind. + * This is why the test for the short cases is 16 and not 12. + * + * The unwind is just the main loop without the tests or final reads.
+ */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + /* aes xform 0, sha quad 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + add v19.4s,v4.4s,v26.4s + rev32 v28.16b,v28.16b /* fix endian w2 */ + sha1su0 v26.4s,v27.4s,v28.4s + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + add v23.4s,v4.4s,v27.4s + sha1c q24,s25,v19.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + rev32 v29.16b,v29.16b /* fix endian w3 */ + sha1su1 v26.4s,v29.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + add v23.4s,v4.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + sha1c q24,s21,v19.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s +___ + &aes192_aes256_dec_handle(1,"dec_epilog",0,0); +$code.=<<___; + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su1 v26.4s,v29.4s + /* aes xform 1, sha quad 1 */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + sha1su0 v27.4s,v28.4s,v29.4s + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + sha1h s21,s24 + add v19.4s,v5.4s,v28.4s + sha1p q24,s22,v23.4s + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + add v23.4s,v5.4s,v29.4s + sha1su1 v27.4s,v26.4s + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + sha1su0 v28.4s,v29.4s,v26.4s + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + sha1h s22,s24 + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + sha1p q24,s21,v19.4s + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s +___ + &aes192_aes256_dec_handle(1,"dec_epilog",1,0); +$code.=<<___; + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + add v19.4s,v6.4s,v28.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v27.4s,v26.4s + /* mode op 2 */ + /* aes xform 2, sha quad 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + sha1su0 v28.4s,v29.4s,v26.4s + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v6.4s,v26.4s + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + sha1h s21,s24 + sha1m q24,s22,v23.4s + aesd v2.16b,v13.16b + aesimc 
v2.16b,v2.16b + sha1su1 v29.4s,v28.4s + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + add v23.4s,v6.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + add v19.4s,v6.4s,v28.4s + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s +___ + &aes192_aes256_dec_handle(1,"dec_epilog",2,0); +$code.=<<___; + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + sha1su1 v28.4s,v27.4s + add v23.4s,v7.4s,v29.4s + /* mode op 3 */ + /* aes xform 3, sha quad 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + sha1su0 v29.4s,v26.4s,v27.4s + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + sha1su1 v29.4s,v28.4s + add v19.4s,v7.4s,v26.4s + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + /* read first aes block, no bump */ + ld1 {v0.16b},[x0] + add v23.4s,v7.4s,v27.4s + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v28.4s + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + add v23.4s,v7.4s,v29.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s +___ + &aes192_aes256_dec_handle(1,"dec_epilog",3,0); +$code.=<<___; + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + /* + * now we have to do the 4 aes blocks (b-2) that catch up to where sha is + */ + + /* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",0,0); +$code.=<<___; + eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + + /* aes xform 1 */ + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",1,0); +$code.=<<___; + eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + /* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + /* read 
next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",2,0); +$code.=<<___; + eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + + /* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + eor v26.16b,v26.16b,v26.16b /* zero the rest */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ +___ + &aes192_aes256_dec_handle(1,"dec_catchup",3,0); +$code.=<<___; + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + + add x9,x9,4 + +/* + * Now, there is the final b-1 sha1 padded block. + * This contains between 0-3 aes blocks. We take some pains to avoid read spill + * by only reading the blocks that are actually defined. + * this is also the final sha block code for the short_cases. + */ +.Ljoin_common: + mov w15,0x80 /* that's the 1 of the pad */ +.Lpost_loop_Q0: + /* assume this was final block */ + mov v26.b[0],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + /* overwrite previous v26 value (0x80) */ + mov v26.d[0],x2 + /* assume this was final block */ + mov v26.b[8],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + mov v26.d[1],x2 +.Lpost_loop_Q1: + /* assume this is final block */ + mov v27.b[0],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + /* overwrite previous v27 value (0x80) */ + mov v27.d[0],x2 + /* assume this was final block */ + mov v27.b[8],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + mov v27.d[1],x2 +.Lpost_loop_Q2: + /* assume this was final block */ + mov v28.b[0],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + /* overwrite previous v28 value (0x80) */ + mov v28.d[0],x2 + /* assume this was final block */ + mov v28.b[8],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + mov v28.d[1],x2 +.Lpost_loop_Q3: + /* assume this was final block */ + mov v29.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v29 value (0x80) */ + mov v29.d[0],x2 + /* assume this was final block */ + mov v29.b[11],w15 
+ /* outstanding 8B blocks left */ + cbz x5,1f + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + rev32 x2,x2 + mov v29.d[1],x2 + +/* + * That is enough of blocks, we allow up to 64 bytes in total. + * Now we have the sha1 to do for these 4 16B blocks + */ +1: + rev32 v26.16b,v26.16b + rev32 v27.16b,v27.16b + rev32 v28.16b,v28.16b + //rev32 v29.16b,v29.16b + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + add v23.4s,v7.4s,v27.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v19.4s,v7.4s,v28.4s + sha1h s21,s24 + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + sha1p q24,s22,v23.4s + + add v23.4s,v7.4s,v29.4s + sha1h s22,s24 + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + sha1p q24,s21,v19.4s + + sha1h s21,s24 + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + add v24.4s,v24.4s,v20.4s + + /* this was final block */ + cbz x5,.Lpost_loop + subs x5,x5,8 + /* loop if hash is not finished */ + b.ne .Lpost_loop_Q0 + /* set "1" of the padding if this was a final block */ + mov v26.b[0],w15 + +.Lpost_loop: + /* Add outstanding bytes of digest source */ + add x11,x11,x8 + /* Add one SHA-1 block since hash is calculated including i_key_pad */ + add x11,x11,#64 + lsr x12,x11,32 /* len_hi */ + and x14,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x14,x14,3 /* len_lo in bits */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v29.s[3],w14 /* len_lo */ + rev32 v27.16b,v27.16b /* fix endian w1 
*/ + mov v29.s[2],w12 /* len_hi */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + /* skip write back if there were less than 4 AES blocks */ + cbz x9,1f + /* + * At this point all data should be fetched for SHA. + * Save remaining blocks without danger of overwriting SHA source. + */ + stp q0,q1,[x1],32 + stp q2,q3,[x1],32 +1: + /* + * final sha block + * The strategy is to combine the 0-3 aes blocks, which is faster but + * a little gourmand on code space. + */ + cbz x13,.Lzero_aes_blocks_left /* none to do */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v31.16b},[x0],16 + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + add v19.4s,v4.4s,v26.4s + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + add v23.4s,v4.4s,v27.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha1c q24,s25,v19.4s + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha1h s21,s24 + sha1c q24,s22,v23.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + sha1su1 v27.4s,v26.4s + add v19.4s,v4.4s,v28.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha1c q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + add v23.4s,v4.4s,v29.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s +___ + &aes192_aes256_dec_handle(1,"dec_final1",0,0); +$code.=<<___; + sha1su1 v29.4s,v28.4s + eor v3.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + sha1su1 v26.4s,v29.4s + /* dec counter */ + sub x13,x13,1 + cbz x13,.Lfrmquad1 + + /* aes xform 1 */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v30.16b},[x0],16 + add v23.4s,v5.4s,v27.4s + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + add v19.4s,v5.4s,v28.4s + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + add v23.4s,v5.4s,v29.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha1su1 v29.4s,v28.4s + add v19.4s,v5.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + sha1p q24,s21,v19.4s +___ + &aes192_aes256_dec_handle(1,"dec_final2",0,0); +$code.=<<___; + sha1su1 v26.4s,v29.4s + eor v3.16b,v0.16b,v31.16b /* xor w/ ivec (modeop) */ + add v23.4s,v5.4s,v27.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + sha1su1 v27.4s,v26.4s + + sub x13,x13,1 /* dec counter */ + cbz x13,.Lfrmquad2 + + /* aes xform 2 */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + add v19.4s,v6.4s,v28.4s + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + add v23.4s,v6.4s,v29.4s + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + aesd 
v0.16b,v10.16b + aesimc v0.16b,v0.16b + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha1m q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + add v19.4s,v6.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha1su1 v26.4s,v29.4s + add v23.4s,v6.4s,v27.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s +___ + &aes192_aes256_dec_handle(1,"dec_final3",0,0); +$code.=<<___; + sha1su1 v27.4s,v26.4s + eor v3.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + add v19.4s,v6.4s,v28.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + sha1su1 v28.4s,v27.4s + b .Lfrmquad3 + +/* + * The final block with no aes component, i.e from here there were zero blocks + */ +.Lzero_aes_blocks_left: + + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su1 v26.4s,v29.4s + +/* quad 1 */ +.Lfrmquad1: + add v23.4s,v5.4s,v27.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + +/* quad 2 */ +.Lfrmquad2: + add v19.4s,v6.4s,v28.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + +/* quad 3 */ +.Lfrmquad3: + add v23.4s,v7.4s,v29.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + add v23.4s,v7.4s,v27.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v19.4s,v7.4s,v28.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v23.4s,v7.4s,v29.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v26.4s,v24.4s,v20.4s + add v27.4s,v25.4s,v21.4s + + /* calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor 
v29.16b, v29.16b, v29.16b + /* load o_key_pad partial hash */ + ldr q24, [x7] + eor v25.16b, v25.16b, v25.16b + ldr s25, [x7, #16] + /* working ABCD <- ABCD */ + mov v20.16b,v24.16b + + /* set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v27.b[7], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+20 + /* move length to the end of the block */ + lsl x11, x11, 3 + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + add v23.4s,v7.4s,v27.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v19.4s,v7.4s,v28.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + + add v23.4s,v7.4s,v29.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + sha1h s21,s24 + sha1p q24,s22,v23.4s + + ldp d14,d15,[sp,#48] + ldp d8,d9,[sp],#64 + + mov x0, xzr + + add v24.4s,v24.4s,v20.4s + add v25.4s,v25.4s,v21.4s + + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + + st1 {v24.16b}, [x4],16 + st1 {v25.s}[0], [x4] + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. 
+ * x10 = aes_blocks + */ +.Ldec_short_cases: + ldp q8,q9,[x9],32 + adr x8,.Lrcon /* rcon */ + ldp q10,q11,[x9],32 + lsl x11,x10,4 /* len = aes_blocks*16 */ + + ldp q12,q13,[x9],32 + ldp q4,q5,[x8],32 /* key0, key1 */ + ldp q14,q15,[x9],32 + ld1 {v30.16b},[x6] /* get ivec */ + ldp q16,q17,[x9],32 + ldp q6,q7,[x8] /* key2, key3 */ + ld1 {v18.16b},[x9] + + /* get outstanding bytes of the digest */ + sub x8,x5,x2 + + /* indicate AES blocks to write back */ + mov x9,xzr + + mov x2,x0 + /* + * Digest source has to be at least of cipher source length + * therefore it is safe to use x10 to indicate whether we can + * overtake cipher processing by 4 AES block here. + */ + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + + ldp q26,q27,[x3],32 + rev32 v26.16b,v26.16b + rev32 v27.16b,v27.16b + ldp q28,q29,[x3],32 + rev32 v28.16b,v28.16b + rev32 v29.16b,v29.16b + + sub x5,x5,64 + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + /* quad 0 */ + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + /* quad 1 */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + /* quad 2 */ + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + /* quad 3 */ + add v23.4s,v7.4s,v27.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v19.4s,v7.4s,v28.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v23.4s,v7.4s,v29.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + /* there were at least 4 AES blocks to process */ + b .Lshort_loop_no_store + +.Ldec_short_loop: + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + + stp q0,q1,[x1],32 + stp q2,q3,[x1],32 + 
+ sub x9,x9,4 + +.Lshort_loop_no_store: + + ld1 {v31.16b},[x2] /* next w no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x2],16 + + add x0,x0,64 + + /* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",0,0); +$code.=<<___; + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + + ld1 {v30.16b},[x2] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x2],16 + + /* aes xform 1 */ + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",1,0); +$code.=<<___; + eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */ + + ld1 {v31.16b},[x2] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x2],16 + + /* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",2,0); +$code.=<<___; + eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */ + + ld1 {v30.16b},[x2] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x2],16 + + /* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",3,0); +$code.=<<___; + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + + add x9,x9,4 + + sub x10,x10,4 /* 4 less */ + cmp x5,64 + b.lt .Ldec_short_loop /* keep looping */ + + ldp q26,q27,[x3],32 + rev32 v26.16b,v26.16b + rev32 v27.16b,v27.16b + ldp q28,q29,[x3],32 + rev32 v28.16b,v28.16b + rev32 v29.16b,v29.16b + + sub x5,x5,64 + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + /* quad 0 */ + add v19.4s,v4.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + add v23.4s,v4.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v4.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 
v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + /* quad 1 */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + /* quad 2 */ + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v28.4s,v27.4s + + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v26.4s + sha1su1 v29.4s,v28.4s + + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v6.4s,v27.4s + sha1su1 v26.4s,v29.4s + + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v27.4s,v26.4s + + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v28.4s,v27.4s + + /* quad 3 */ + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v26.4s + sha1su1 v29.4s,v28.4s + + add v23.4s,v7.4s,v27.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v19.4s,v7.4s,v28.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v23.4s,v7.4s,v29.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + b .Ldec_short_loop /* keep looping */ +/* + * This is arranged so that we can join the common unwind code + * that does the last sha block and the final 0-3 aes blocks + */ +.Llast_sha_block: + eor v26.16b,v26.16b,v26.16b /* zero the rest */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + + mov x13,x10 /* copy aes blocks for common */ + b .Ljoin_common /* join common code */ + +.size asm_sha1_hmac_aescbc_dec, .-asm_sha1_hmac_aescbc_dec +___ + +if ($flavour =~ /64/) { + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/aes/asm/aes-sha256-armv8.pl b/crypto/aes/asm/aes-sha256-armv8.pl new file mode 100644 index 00000000000..766b2ef0f72 --- /dev/null +++ b/crypto/aes/asm/aes-sha256-armv8.pl @@ -0,0 +1,4631 @@ +#! /usr/bin/env perl + +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (C) Cavium networks Ltd. 2016. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +#======================================================================== +# Derived from following files in +# https://github.com/ARM-software/AArch64cryptolib +# AArch64cryptolib_opt_big/aes_cbc_sha256/aes128cbc_sha256_hmac.S +# AArch64cryptolib_opt_big/aes_cbc_sha256/sha256_hmac_aes128cbc_dec.S +#======================================================================== + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$code=<<___; +#include "arm_arch.h" + +# Theses are offsets into the CIPH_DIGEST struct +#define CIPHER_KEY 0 +#define CIPHER_KEY_ROUNDS 8 +#define CIPHER_IV 16 +#define HMAC_IKEYPAD 24 +#define HMAC_OKEYPAD 32 + +.text +.arch armv8-a+crypto +___ + +sub aes192_aes256_handle () { + my $compare = shift; + my $label = shift; + my $i = shift; + my $load_rk10 = shift; + + if($compare == 1) { +$code.=<<___; + cmp x16,#12 +___ + } +$code.=<<___; + b.lt .Laes128_${label}_$i +.Laes192_${label}_$i: + ldp q30,q31,[x17],32 /* rk[10],rk[11] */ + aese v$i.16b,v17.16b + aesmc v$i.16b,v$i.16b + aese v$i.16b,v30.16b + aesmc v$i.16b,v$i.16b + b.gt .Laes256_${label}_$i + ld1 {v30.16b},[x17] /* rk[12] */ + aese v$i.16b,v31.16b + eor v$i.16b,v$i.16b,v30.16b + sub x17, x17, #32 /* rewind x17 */ + b 1f +.Laes256_${label}_$i: + aese v$i.16b,v31.16b + aesmc v$i.16b,v$i.16b + ldp q30,q31,[x17],32 /* rk[12],rk[13] */ + aese v$i.16b,v30.16b + aesmc v$i.16b,v$i.16b + ld1 {v30.16b},[x17] /* rk[14] */ + aese v$i.16b,v31.16b + eor v$i.16b,v$i.16b,v30.16b + sub x17, x17, #64 /* rewind x17 */ + b 1f +.Laes128_${label}_$i: +___ + if ($load_rk10 == 1) { +$code.=<<___; + ld1 {v18.16b},[x9] +___ + } +$code.=<<___; + aese v$i.16b,v17.16b + eor v$i.16b,v$i.16b,v18.16b /* res 0 */ +1: +___ +} + +sub aes192_aes256_dec_handle () { + my $compare = shift; + my $label = shift; + my $i = shift; + my $load_rk10 = shift; + + if($compare == 1) { +$code.=<<___; + cmp x16,#12 +___ + } +$code.=<<___; + b.lt .Laes128_${label}_$i +.Laes192_${label}_$i: + stp q19,q23,[sp, #-32]! 
+ ld1 {v19.16b},[x17],16 /* rk[10] */ + ld1 {v23.16b},[x17],16 /* rk[11] */ + aesd v$i.16b,v17.16b + aesimc v$i.16b,v$i.16b + aesd v$i.16b,v19.16b + aesimc v$i.16b,v$i.16b + b.gt .Laes256_${label}_$i + ld1 {v19.16b},[x17] /* rk[12] */ + aesd v$i.16b,v23.16b + eor v$i.16b,v$i.16b,v19.16b + sub x17, x17, #32 /* rewind x17 */ + ldp q19,q23,[sp], #32 + b 1f +.Laes256_${label}_$i: + aesd v$i.16b,v23.16b + aesimc v$i.16b,v$i.16b + ld1 {v19.16b},[x17],16 /* rk[12] */ + ld1 {v23.16b},[x17],16 /* rk[13] */ + aesd v$i.16b,v19.16b + aesimc v$i.16b,v$i.16b + ld1 {v19.16b},[x17] /* rk[14] */ + aesd v$i.16b,v23.16b + eor v$i.16b,v$i.16b,v19.16b + sub x17, x17, #64 /* rewind x17 */ + ldp q19,q23,[sp], #32 + b 1f +.Laes128_${label}_$i: +___ + if ($load_rk10 == 1) { +$code.=<<___; + ld1 {v18.16b},[x9] +___ + } +$code.=<<___; + aesd v$i.16b,v17.16b + eor v$i.16b,v$i.16b,v18.16b /* res 0 */ +1: +___ +} + +$code.=<<___; +# Description: +# +# Combined Enc/Auth Primitive = aes128cbc/sha256_hmac +# +# Operations: +# +# out = encrypt-AES128CBC(in) +# return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | out)) +# +# Prototype: +# void asm_aescbc_sha256_hmac(uint8_t *csrc, uint8_t *cdst, uint64_t clen, +# uint8_t *dsrc, uint8_t *ddst, uint64_t dlen, +# CIPH_DIGEST *arg) +# +# Registers used: +# +# asm_aescbc_sha256_hmac( +# csrc, x0 (cipher src address) +# cdst, x1 (cipher dst address) +# clen x2 (cipher length) +# dsrc, x3 (digest src address) +# ddst, x4 (digest dst address) +# dlen, x5 (digest length) +# arg x6 : +# arg->cipher.key (round keys) +# arg->cipher.key_rounds (key rounds) +# arg->cipher.iv (initialization vector) +# arg->digest.hmac.i_key_pad (partially hashed i_key_pad) +# arg->digest.hmac.o_key_pad (partially hashed o_key_pad) +# ) +# +# Routine register definitions: +# +# v0 -- v3 -- aes results +# v4 -- v7 -- round consts for sha +# v8 -- v18 -- round keys +# v19 -- v20 -- round keys +# v21 -- ABCD tmp +# v22 -- sha working state ABCD (q22) +# v23 -- sha working state EFGH (q23) +# v24 -- sha state ABCD +# v25 -- sha state EFGH +# v26 -- sha block 0 +# v27 -- sha block 1 +# v28 -- sha block 2 +# v29 -- sha block 3 +# v30 -- reserved +# v31 -- reserved +# +# Constraints: +# +# The variable "clen" must be a multiple of 16, otherwise results +# are not defined. For AES partial blocks the user is required +# to pad the input to a multiple of 16 bytes. +# The variable "dlen" must be a multiple of 8 and greater than or equal +# to "clen". This constraint is strictly related to the needs of the IPSec +# ESP packet. Encrypted payload is hashed along with the 8-byte ESP header, +# forming ICV. Speed gain is achieved by doing both things at the same time, +# hence lengths are required to match at least at the cipher level.
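+#
+# A worked example of the length constraint (a hedged sketch of the ESP
+# case above; the buffer names pt, esp, icv and arg are hypothetical and
+# not part of this patch): for a 64-byte payload the ciphertext is written
+# after the 8-byte ESP header and both are hashed together, e.g.
+#
+#   asm_aescbc_sha256_hmac(pt, esp + 8, 64,   /* csrc, cdst, clen        */
+#                          esp, icv, 72,      /* dsrc, ddst, dlen = 8+64 */
+#                          &arg);             /* keys, IV and HMAC pads  */
+#
+# Here dlen = 72 is a multiple of 8 and >= clen = 64, as required.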
+# +# Short lengths are not optimized at < 12 AES blocks + +.global asm_aescbc_sha256_hmac +.type asm_aescbc_sha256_hmac,%function + +.align 4 +.Lrcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +.Linit_sha_state: + .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +asm_aescbc_sha256_hmac: + AARCH64_VALID_CALL_TARGET + /* protect registers */ + stp d8,d9,[sp,#-64]! + /* fetch args */ + ldr x7, [x6, #HMAC_IKEYPAD] + /* init ABCD, EFGH. */ + ldp q24,q25,[x7] + /* save pointer to o_key_pad partial hash */ + ldr x7, [x6, #HMAC_OKEYPAD] + + stp d10,d11,[sp,#16] + + /* address of sha init state consts */ + adr x12,.Linit_sha_state + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x2,4 /* aes_blocks = len/16 */ + + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + + ldr x9, [x6, #CIPHER_KEY] + ldr x16, [x6, #CIPHER_KEY_ROUNDS] + ldr x6, [x6, #CIPHER_IV] + add x17, x9, #160 /* point to the last 5 rounds keys */ + + /* + * Init sha state, prefetch, check for small cases. 
+ * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */ + cmp x10,12 /* no main loop if <12 */ + b.lt .Lenc_short_cases /* branch if < 12 */ + + /* proceed */ + ld1 {v3.16b},[x6] /* get 1st ivec */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov x11,x2 /* len -> x11 needed at end */ + lsr x12,x11,6 /* total_blocks */ + /* + * now we can do the loop prolog, 1st aes sequence of 4 blocks + */ + ld1 {v8.16b},[x9],16 /* rk[0] */ + ld1 {v9.16b},[x9],16 /* rk[1] */ + eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */ + ld1 {v10.16b},[x9],16 /* rk[2] */ + + /* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + ld1 {v11.16b},[x9],16 /* rk[3] */ + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round consts */ + adr x8,.Lrcon + ld1 {v12.16b},[x9],16 /* rk[4] */ + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + ld1 {v13.16b},[x9],16 /* rk[5] */ + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + ld1 {v14.16b},[x9],16 /* rk[6] */ + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + ld1 {v15.16b},[x9],16 /* rk[7] */ + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x9],16 /* rk[8] */ + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + ld1 {v17.16b},[x9],16 /* rk[9] */ + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b +___ + &aes192_aes256_handle(1, "enc_prolog", 0, 1); +$code.=<<___; + eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,0*64] /* rcon */ + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + ld1 {v26.16b},[x3],16 + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,2*64] /* rcon */ + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,4*64] /* rcon */ + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + prfm PLDL1KEEP,[x8,6*64] /* rcon */ +___ + &aes192_aes256_handle(0, "enc_prolog", 1, 0); +$code.=<<___; + prfm PLDL1KEEP,[x8,8*64] /* rcon */ + eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */ + + /* aes xform 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + prfm PLDL1KEEP,[x8,10*64] /* rcon */ + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + ld1 {v27.16b},[x3],16 + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + prfm PLDL1KEEP,[x8,12*64] /* rcon */ + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + prfm PLDL1KEEP,[x8,14*64] /* rcon */ + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b +___ + &aes192_aes256_handle(0, "enc_prolog", 2, 0); +$code.=<<___; + eor v3.16b,v3.16b,v2.16b /* xor w/ivec (modeop) */ + + /* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc 
v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + ld1 {v28.16b},[x3],16 + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + sub x15,x12,1 /* main_blocks = total_blocks - 1 */ + and x13,x10,3 /* aes_blocks_left */ +___ + &aes192_aes256_handle(0, "enc_prolog", 3, 0); +$code.=<<___; + /* + * Note, aes_blocks_left := number after the main (sha) + * block is done. Can be 0 + */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + ld1 {v29.16b},[x3],16 + + /* get outstanding bytes of the digest */ + sub x12,x5,x2 + /* substract loaded bytes */ + sub x5,x5,64 + + /* + * main combined loop CBC + */ +.Lenc_main_loop: + /* base address for sha round consts */ + adr x8,.Lrcon + /* + * Because both mov, rev32 and eor have a busy cycle,this takes longer + * than it looks. That's OK since there are 6 cycles before we can use + * the load anyway; so this goes as fast as it can without SW + * pipelining(too complicated given the code size) + */ + rev32 v26.16b,v26.16b + /* next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x9,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + ld1 {v4.16b},[x8],16 /* key0 */ + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + ld1 {v5.16b},[x8],16 /* key1 */ + /* + * aes xform 0, sha quad 0 + */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + ld1 {v6.16b},[x8],16 /* key2 */ + rev32 v28.16b,v28.16b + ld1 {v7.16b},[x8],16 /* key3 */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v26.4s,v27.4s + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + /* no place to get rid of this stall */ + rev32 v29.16b,v29.16b + sha256h2 q23, q21, v4.4s + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su0 v27.4s,v28.4s + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v5.4s + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + ld1 {v5.16b},[x8],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + sha256su0 v28.4s,v29.4s + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + sha256h q22, q23, v7.4s +___ + &aes192_aes256_handle(1, "enc_mainloop", 0, 0); +$code.=<<___; + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + + /* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */ + ld1 {v7.16b},[x8],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + 
aese v1.16b,v8.16b
+ aesmc v1.16b,v1.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aese v1.16b,v9.16b
+ aesmc v1.16b,v1.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v4.16b},[x8],16 /* key4 */
+ aese v1.16b,v11.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v5.16b},[x8],16 /* key5 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ sha256h q22, q23, v6.4s
+ aese v1.16b,v12.16b
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x8],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ aese v1.16b,v14.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v7.16b},[x8],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+___
+ &aes192_aes256_handle(0, "enc_mainloop", 1, 0);
+$code.=<<___;
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ /* mode op 2 */
+ eor v2.16b,v2.16b,v1.16b /* mode op 2 xor w/prev value */
+
+ /* aes xform 2, sha quad 2 */
+ sha256su0 v26.4s,v27.4s
+ aese v2.16b,v8.16b
+ aesmc v2.16b,v2.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ aese v2.16b,v9.16b
+ aesmc v2.16b,v2.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x8],16 /* key4 */
+ sha256su0 v27.4s,v28.4s
+ aese v2.16b,v10.16b
+ aesmc v2.16b,v2.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ aese v2.16b,v11.16b
+ aesmc v2.16b,v2.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x8],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aese v2.16b,v12.16b
+ aesmc v2.16b,v2.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ aese v2.16b,v14.16b
+ aesmc v2.16b,v2.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x8],16 /* key6 */
+ ld1 {v7.16b},[x8],16 /* key7 */
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+___
+ &aes192_aes256_handle(0, "enc_mainloop", 2, 0);
+$code.=<<___;
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ /* mode op 3 */
+ eor v3.16b,v3.16b,v2.16b /* xor w/prev value */
+
+ /* aes xform 3, sha quad 3 (hash only) */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ aesmc v3.16b,v3.16b
+ mov
v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + sub x15,x15,1 /* dec block count */ + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b +___ + &aes192_aes256_handle(0, "enc_mainloop", 3, 0); +$code.=<<___; + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + + ldp q26,q27,[x3],32 + ldp q28,q29,[x3],32 + sub x5,x5,64 + + cbnz x15,.Lenc_main_loop /* loop if more to do */ + + mov w15,0x80 /* that's the 1 of the pad */ + /* + * epilog, process remaining aes blocks and b-2 sha block + * do this inline (no loop) to overlap with the sha part + * note there are 0-3 aes blocks left. + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + cbz x13, .Lbm2fromQ0 /* skip if none left */ + + /* + * mode op 0 + * read next aes block, update aes_ptr_in + */ + ld1 {v0.16b},[x0],16 + /* base address for sha round consts */ + adr x8,.Lrcon + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + ld1 {v6.16b},[x8],16 /* key2 */ + ld1 {v7.16b},[x8],16 /* key3 */ + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + + /* aes xform 0, sha quad 0 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su0 v26.4s,v27.4s + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su0 v27.4s,v28.4s + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s +___ + &aes192_aes256_handle(1, "enc_epilog", 0, 0); +$code.=<<___; + subs x14,x13,1 /* local copy of aes_blocks_left */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ1 + /* + * mode op 1 + * read next aes block, update aes_ptr_in + */ + ld1 {v1.16b},[x0],16 + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + + eor v1.16b,v1.16b,v0.16b /* xor w/prev value */ + + /* aes xform 1, sha quad 1 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 
*/ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su0 v26.4s,v27.4s + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v27.4s,v28.4s + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + sha256su1 v29.4s,v27.4s,v28.4s +___ + &aes192_aes256_handle(1, "enc_epilog", 1, 0); +$code.=<<___; + subs x14,x14,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ2 + /* + * mode op 2 + * read next aes block, update aes_ptr_in + */ + ld1 {v2.16b},[x0],16 + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + eor v2.16b,v2.16b,v1.16b /* xor w/prev value */ + + /* aes xform 2, sha quad 2 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su0 v26.4s,v27.4s + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su0 v27.4s,v28.4s + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s +___ + &aes192_aes256_handle(1, "enc_epilog", 2, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* join common code at Quad 3 */ + b .Lbm2fromQ3 +/* + * Now there is the b-2 sha block before the final one. Execution takes over + * in the appropriate part of this depending on how many aes blocks were left. + * If there were none, the whole thing is executed. 
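+ * ("b-2" is the second-to-last sha block of the digest stream; entering
+ * at .Lbm2fromQ1, Q2 or Q3 means the earlier quads were already computed,
+ * interleaved with the epilog aes blocks above.)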
+ */ +/* quad 0 */ +.Lbm2fromQ0: + /* base address for sha round consts */ + adr x8,.Lrcon + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ +.Lbm2fromQ1: + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ +.Lbm2fromQ2: + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ +.Lbm2fromQ3: + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + eor v26.16b,v26.16b,v26.16b /* zero reg */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + 
sha256h q22, q23, v6.4s + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* Process remaining 0-3 AES blocks here */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + cbz x13,.Lpost_long_Q0 + + /* 1st remaining AES block */ + ld1 {v26.16b},[x3],16 + sub x5,x5,16 + rev32 v26.16b,v26.16b + subs x14,x13,1 + b.eq .Lpost_long_Q1 + + /* 2nd remaining AES block */ + ld1 {v27.16b},[x3],16 + sub x5,x5,16 + rev32 v27.16b,v27.16b + subs x14,x14,1 + b.eq .Lpost_long_Q2 + + /* 3rd remaining AES block */ + ld1 {v28.16b},[x3],16 + sub x5,x5,16 + rev32 v28.16b,v28.16b + /* Allow for filling this sha256 block with the remaining digest src */ + b .Lpost_long_Q3 +/* + * Process remaining 8B blocks of the digest + */ +.Lpost_long_Q0: +/* blk 0,1 */ + /* assume final block */ + mov v26.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v26 value (0x80) */ + mov v26.d[0],x2 + /* assume this was final block */ + mov v26.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v26.d[1],x2 + +.Lpost_long_Q1: +/* blk 2,3 */ + /* assume this is final block */ + mov v27.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v27 value (0x80) */ + mov v27.d[0],x2 + /* assume this was final block */ + mov v27.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v27.d[1],x2 + +.Lpost_long_Q2: +/* blk 4,5 */ + /* assume this was final block */ + mov v28.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v28 value (0x80) */ + mov v28.d[0],x2 + /* assume this was final block */ + mov v28.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v28.d[1],x2 + +.Lpost_long_Q3: +/* blk 6,7 */ + /* assume this was final block */ + mov v29.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_long_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v29 value (0x80) */ + mov v29.d[0],x2 + /* assume this was final block */ + mov v29.b[11],w15 + /* + * Outstanding 8B blocks left. + * Since there has to be another sha block with padding, + * we need to calculate hash without padding here. + */ + cbz x5,1f + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + rev32 x2,x2 + /* + * Don't decrease x5 here. + * Use it to indicate necessity of constructing "1" padding at the end. + */ + mov v29.d[1],x2 + +/* + * That is enough of blocks, we allow up to 64 bytes in total. 
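+ * (At this point v26..v29 hold the remaining digest-only data plus the
+ * 0x80 pad markers placed above; untouched words are still zero.)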
+ * Now we have the sha256 to do for these 4 16B blocks + */ +1: + /* base address for sha round consts */ + adr x8,.Lrcon + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + mov v21.16b, v22.16b /* copy abcd */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, 
q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + /* this was final block */ + cbz x5,.Lpost_long_loop + subs x5,x5,8 + /* loop if hash is not finished */ + b.ne .Lpost_long_Q0 + /* set "1" of the padding if this was a final block */ + mov v26.b[3],w15 + +.Lpost_long_loop: + /* Add outstanding bytes of digest source */ + add x11,x11,x12 + /* Add one SHA-256 block since hash is calculated including i_key_pad */ + add x11,x11, #64 + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + /* + * do last sha of pad block + */ + /* base address for sha round consts */ + adr x8,.Lrcon + + /* quad 0 */ + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h 
q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + ldp q6,q7,[x8],32 /* key6,key7 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + mov v21.16b, v22.16b /* copy abcd */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v26.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v27.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* base address for sha round consts */ + adr x8,.Lrcon + /* load o_key_pad partial hash */ + ldp q24,q25,[x7] + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v28.b[3], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+32 + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + ldp q4,q5,[x8],32 /* key0,key1 */ + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* 
wk = key7+w3 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ ld1 {v5.16b},[x8],16 /* key5 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v6.16b},[x8],16 /* key6 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v7.16b},[x8],16 /* key7 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ ld1 {v4.16b},[x8],16 /* key4 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ ld1 {v5.16b},[x8],16 /* key5 */
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ldp q6,q7,[x8],32 /* key6,key7 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ mov v21.16b, v22.16b /* copy abcd */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256h q22, q23, v4.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ ldp d14,d15,[sp,#48]
+ ldp d8,d9,[sp],#64
+
+ mov x0, xzr
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ stp q24,q25,[x4] /* save them both */
+
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lenc_short_cases:
+ ld1 {v3.16b},[x6] /* get ivec */
+ ldp q8,q9,[x9],32 /* rk[0-1] */
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ ldp q10,q11,[x9],32 /* rk[2-3] */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ ldp q12,q13,[x9],32 /* rk[4-5] */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ ldp q14,q15,[x9],32 /* rk[6-7] */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+ ldp q16,q17,[x9],32 /* rk[8-9] */
+ mov w15,0x80 /* sha padding word */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+ ld1 {v18.16b},[x9] /* rk[10] */
+
+ /* get outstanding bytes of the digest */
+ sub x12,x5,x2
+/*
+ * the idea in the short loop (which runs at least once) is to break out
+ * with the padding already in place, except for the final word.
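+ * The final word (the message bit length) is only merged in at
+ * .Lpost_short_loop.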
+ */ +.Lenc_short_loop: + adr x8,.Lrcon /* rcon */ + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/prev value */ + + /* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b +___ + &aes192_aes256_handle(1, "enc_short", 0, 1); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* load next 16 bytes for SHA-256 */ + ld1 {v26.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + rev32 v26.16b,v26.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_Q1 /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + eor v1.16b,v1.16b,v0.16b /* xor w/prev value */ + + /* aes xform 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b +___ + &aes192_aes256_handle(1, "enc_short", 1, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* load next 16 bytes for SHA-256 */ + ld1 {v27.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + rev32 v27.16b,v27.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_Q2 /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/prev value */ + + /* aes xform 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b +___ + &aes192_aes256_handle(1, "enc_short", 2, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* load next 16 bytes for SHA-256 */ + ld1 {v28.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + rev32 v28.16b,v28.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_Q3 /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */ + + /* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b +___ + &aes192_aes256_handle(1, "enc_short", 3, 0); +$code.=<<___; + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + /* load next 16 bytes for 
SHA-256 */ + ld1 {v29.16b},[x3],16 + /* dec number of bytes of the hash input */ + sub x5,x5,16 + /* load res to sha 0, endian swap */ + rev32 v29.16b,v29.16b + /* + * now we have the sha256 to do for these 4 aes blocks + */ + + /* quad 0 */ + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + mov v21.16b, v22.16b /* copy abcd */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256h q22, q23, v4.4s + sha256h2 
q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + sub x10,x10,1 /* dec num_blocks */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + cbnz x10,.Lenc_short_loop /* keep looping if more */ + +.Lpost_short_Q0: + /* assume this was final block */ + mov v26.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v26 value (0x80) */ + mov v26.d[0],x2 + /* assume this was final block */ + mov v26.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v26.d[1],x2 +.Lpost_short_Q1: + /* zero out vectors */ + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + /* assume this is final block */ + mov v27.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v27 value (0x80) */ + mov v27.d[0],x2 + /* assume this was final block */ + mov v27.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v27.d[1],x2 +.Lpost_short_Q2: + /* zero out vectors (repeated if came from Q0) */ + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + /* assume this was final block */ + mov v28.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v28 value (0x80) */ + mov v28.d[0],x2 + /* assume this was final block */ + mov v28.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + mov v28.d[1],x2 +.Lpost_short_Q3: + /* zero out vector (repeated if came from Q1) */ + eor v29.16b,v29.16b,v29.16b + /* assume this was final block */ + mov v29.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_short_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v29 value (0x80) */ + mov v29.d[0],x2 + /* assume this was final block */ + mov v29.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,1f + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + rev32 x2,x2 + mov v29.d[1],x2 + +/* + * That is enough of blocks, we allow up to 64 bytes in total. 
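+ * (If x5 is still non-zero after this block, hashing resumes at
+ * .Lpost_short_Q0 with the next 64 bytes of digest-only data.)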
+ * Now we have the sha256 to do for these 4 16B blocks + */ +1: + /* base address for sha round consts */ + adr x8,.Lrcon + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + mov v21.16b, v22.16b /* copy abcd */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, 
q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + /* this was final block */ + cbz x5,.Lpost_short_loop + subs x5,x5,8 + /* loop if hash is not finished */ + b.ne .Lpost_short_Q0 + /* set "1" of the padding if this was a final block */ + mov v26.b[3],w15 + +/* + * there are between 0 and 3 aes blocks in the final sha256 blocks + */ +.Lpost_short_loop: + /* Add outstanding bytes of digest source */ + add x11,x11,x12 + /* Add one SHA-256 block since hash is calculated including i_key_pad */ + add x11,x11, #64 + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + /* do final block */ + + /* base address for sha round consts */ + adr x8,.Lrcon /* top of rcon */ + + /* quad 0 */ + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 
v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + ldp q6,q7,[x8],32 /* key6,key7 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + mov v21.16b, v22.16b /* copy abcd */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v26.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v27.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* base address for sha round consts */ + adr x8,.Lrcon + /* load o_key_pad partial hash */ + ldp q24,q25,[x7] + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v28.b[3], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+32 + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + ldp q4,q5,[x8],32 /* key0,key1 */ + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, 
q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ ld1 {v4.16b},[x8],16 /* key4 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ ld1 {v5.16b},[x8],16 /* key5 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v6.16b},[x8],16 /* key6 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v7.16b},[x8],16 /* key7 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ ld1 {v4.16b},[x8],16 /* key4 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ ld1 {v5.16b},[x8],16 /* key5 */
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ldp q6,q7,[x8],32 /* key6,key7 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ mov v21.16b, v22.16b /* copy abcd */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256h q22, q23, v4.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ ldp d14,d15,[sp,#48]
+ ldp d8,d9,[sp],#64
+
+ mov x0, xzr
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ stp q24,q25,[x4] /* save them both */
+
+ ret
+
+.size asm_aescbc_sha256_hmac, .-asm_aescbc_sha256_hmac
+
+# Description:
+#
+# Combined Auth/Dec Primitive = sha256_hmac/aes128cbc
+#
+# Operations:
+#
+# out = decrypt-AES128CBC(in)
+# return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | in))
+#
+# Prototype:
+#
+# void asm_sha256_hmac_aescbc_dec(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
+# uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
+# CIPH_DIGEST *arg)
+#
+# Registers used:
+#
+# asm_sha256_hmac_aescbc_dec(
+# csrc, x0 (cipher src address)
+# cdst, x1 (cipher dst address)
+# clen x2 (cipher length)
+# dsrc, x3 (digest src address)
+# ddst, x4 (digest dst address)
+# dlen, x5 (digest length)
+# arg x6:
+# arg->cipher.key (round keys)
+# arg->cipher.key_rounds (key rounds)
+# arg->cipher.iv (initialization vector)
+# arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+# arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+# )
+#
+# Routine register definitions:
+#
+# v0 - v3 -- aes results
+# v4 - v7 -- round consts for sha
+# v8 - v18 -- round keys
+# v19 - v20 -- round keys
+# v21 -- ABCD tmp
+# v22 -- sha working state ABCD (q22)
+# v23 -- sha working state EFGH (q23)
+# v24 -- sha state ABCD
+# v25 -- sha state EFGH
+# v26 -- sha block 0
+# v27 -- sha block 1
+# v28 -- sha block 2
+# v29 -- sha block 3
+# v30 -- reserved
+# v31 -- reserved
+#
+#
+# Constraints:
+#
+# The variable "clen" must be a multiple of 16, otherwise the results are
+# not defined. For partial AES blocks the user is required to pad the input
+# so that its length is a multiple of 16.
+#
+# The variable "dlen" must be a multiple of 8 and greater than or equal to
+# "clen". The maximum difference between "dlen" and "clen" cannot exceed
+# 64 bytes. This constraint is strictly related to the needs of the IPsec
+# ESP packet. Short lengths (< 16 AES blocks) take a less optimized path;
+# they are still somewhat optimized, and more so than in the enc/auth version.
+
+.global asm_sha256_hmac_aescbc_dec
+.type asm_sha256_hmac_aescbc_dec,%function
+
+asm_sha256_hmac_aescbc_dec:
+ AARCH64_VALID_CALL_TARGET
+ /* protect registers */
+ stp d8,d9,[sp, #-80]!
+
+ /* fetch args */
+ ldr x7, [x6, #HMAC_IKEYPAD]
+ /* init ABCD, EFGH */
+ ldp q24,q25,[x7]
+ /* save pointer to o_key_pad partial hash */
+ ldr x7, [x6, #HMAC_OKEYPAD]
+
+ stp d10,d11,[sp,#16]
+
+ prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
+ stp d12,d13,[sp,#32]
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x2,4 /* aes_blocks = len/16 */
+ stp d14,d15,[sp,#48]
+ /* address of sha init state consts */
+ adr x12,.Linit_sha_state
+ stp x19,x20,[sp,#64]
+
+ ldr x9, [x6, #CIPHER_KEY]
+ ldr x16, [x6, #CIPHER_KEY_ROUNDS]
+ ldr x6, [x6, #CIPHER_IV]
+ add x17, x9, #160 /* point to the last 5 rounds keys */
+ /*
+ * Init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case.
+ */
+ cmp x10,16 /* no main loop if <16 */
+ blt .Ldec_short_cases /* branch if < 16 */
+
+ /* get outstanding bytes of the digest */
+ sub x20,x5,x2
+
+ mov x11,x2 /* len -> x11 needed at end */
+ ld1 {v30.16b},[x6] /* get 1st ivec */
+ lsr x12,x11,6 /* total_blocks (sha) */
+
+ ldp q26,q27,[x3],32
+ rev32 v26.16b,v26.16b /* endian swap w0 */
+ rev32 v27.16b,v27.16b /* endian swap w1 */
+ ldp q28,q29,[x3],32
+ rev32 v28.16b,v28.16b /* endian swap w2 */
+ rev32 v29.16b,v29.16b /* endian swap w3 */
+
+ /* subtract loaded bytes */
+ sub x5,x5,64
+ /*
+ * now we can do the loop prolog, 1st sha256 block
+ */
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ /* base address for sha round consts */
+ adr x8,.Lrcon
+ /*
+ * do the first sha256 block on the ciphertext
+ */
+ mov v22.16b,v24.16b /* init working ABCD */
+ mov v23.16b,v25.16b /* init working EFGH */
+
+ /* quad 0 */
+ ld1 {v4.16b},[x8],16 /* key0 */
+ ld1 {v5.16b},[x8],16 /* key1 */
+ ld1 {v6.16b},[x8],16 /* key2 */
+ ld1 {v7.16b},[x8],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v8.16b},[x9],16 /* rk[0] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x8],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v9.16b},[x9],16 /* rk[1] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v10.16b},[x9],16 /* rk[2] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x8],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x8],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ ld1 {v11.16b},[x9],16 /* rk[3] */
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x8],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v12.16b},[x9],16 /* rk[4] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v13.16b},[x9],16 /* rk[5] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v14.16b},[x9],16 /* rk[6] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x8],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + ld1 {v15.16b},[x9],16 /* rk[7] */ + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x8],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v16.16b},[x9],16 /* rk[8] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v17.16b},[x9],16 /* rk[9] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v18.16b},[x9],16 /* rk[10] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x8],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x8],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v4.4s + ld1 {v26.16b},[x3],16 /* next w0 */ + ld1 {v27.16b},[x3],16 /* next w1 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v5.4s + ld1 {v28.16b},[x3],16 /* next w2 */ + ld1 {v29.16b},[x3],16 /* next w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + /* substract loaded bytes */ + sub x5,x5,64 + + /* + * aes_blocks_left := number after the main (sha) block is done. 
+ * can be 0 note we account for the extra unwind in main_blocks + */ + sub x15,x12,2 /* main_blocks=total_blocks-5 */ + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + and x13,x10,3 /* aes_blocks_left */ + ld1 {v0.16b},[x0] /* next aes block, no update */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + add x9,x0,128 /* lead_ptr = *in */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + /* indicate AES blocks to write back */ + mov x19,xzr +/* + * main combined loop CBC, can be used by auth/enc version + */ +.Ldec_main_loop: + /* + * Because both mov, rev32 and eor have a busy cycle, this takes longer + * than it looks. + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x9,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + /* base address for sha round consts */ + adr x8,.Lrcon + + /* + * aes xform 0, sha quad 0 + */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + ld1 {v4.16b},[x8],16 /* key0 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + sha256su0 v26.4s,v27.4s + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + ld1 {v6.16b},[x8],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v7.16b},[x8],16 /* key3 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + sha256h2 q23, q21, v4.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + ld1 {v5.16b},[x8],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + sha256su0 v28.4s,v29.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + sha256h q22, q23, v7.4s +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",0,0); +$code.=<<___; + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + ld1 {v6.16b},[x8],16 /* key6 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* get next aes block, with update */ + ld1 {v30.16b},[x0],16 + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + + /* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + ld1 {v7.16b},[x8],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add 
v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v4.16b},[x8],16 /* key4 */ + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + ld1 {v5.16b},[x8],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + sha256h q22, q23, v6.4s + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + ld1 {v7.16b},[x8],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",1,0); +$code.=<<___; + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + /* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su0 v27.4s,v28.4s + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x8],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",2,0); +$code.=<<___; + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + + /* aes xform 3, sha quad 3 (hash only) */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + ld1 {v26.16b},[x3],16 /* next w0 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + ld1 {v27.16b},[x3],16 /* next w1 */ + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + ld1 {v28.16b},[x3],16 /* next w2 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + ld1 {v29.16b},[x3],16 /* next w3 */ + mov 
v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + sub x15,x15,1 /* dec block count */ + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + ld1 {v0.16b},[x0] /* next aes block, no update */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b +___ + &aes192_aes256_dec_handle(1,"dec_mainloop",3,0); +$code.=<<___; + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + /* substract loaded bytes */ + sub x5,x5,64 + cbnz x15,.Ldec_main_loop /* loop if more to do */ + /* + * Now the loop epilog. Since the reads for sha have already been done + * in advance, we have to have an extra unwind. + * This is why the test for the short cases is 16 and not 12. + * + * The unwind, which is just the main loop without the tests or final reads. + */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + /* base address for sha round consts */ + adr x8,.Lrcon + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + /* + * aes xform 0, sha quad 0 + */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + ld1 {v6.16b},[x8],16 /* key2 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + ld1 {v7.16b},[x8],16 /* key3 */ + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v26.4s,v27.4s + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + sha256h2 q23, q21, v4.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + ld1 {v5.16b},[x8],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + sha256su0 v28.4s,v29.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + sha256h q22, q23, v7.4s +___ + &aes192_aes256_dec_handle(1,"dec_epilog",0,0); +$code.=<<___; + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + ld1 {v6.16b},[x8],16 /* key6 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + + /* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + ld1 {v7.16b},[x8],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + 
aesimc v1.16b,v1.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + + ld1 {v4.16b},[x8],16 /* key4 */ + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + ld1 {v5.16b},[x8],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + sha256h q22, q23, v6.4s + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + ld1 {v7.16b},[x8],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ +___ + &aes192_aes256_dec_handle(1,"dec_epilog",1,0); +$code.=<<___; + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + /* mode op 2 */ + + /* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x8],16 /* key4 */ + sha256su0 v27.4s,v28.4s + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x8],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b +___ + &aes192_aes256_dec_handle(1,"dec_epilog",2,0); +$code.=<<___; + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + + /* mode op 3 */ + + /* aes xform 3, sha quad 3 (hash only) */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, 
v4.4s + sha256h2 q23, q21, v4.4s + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + /* read first aes block, no bump */ + ld1 {v0.16b},[x0] + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b +___ + &aes192_aes256_dec_handle(1,"dec_epilog",3,0); +$code.=<<___; + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + eor v3.16b,v3.16b,v31.16b /* xor w/prev value */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + + /* + * now we have to do the 4 aes blocks (b-2) that catch up to where sha is + */ + + /* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",0,0); +$code.=<<___; + eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + + /* aes xform 1 */ + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",1,0); +$code.=<<___; + eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + /* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",2,0); +$code.=<<___; + eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + + /* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + 
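/* zero the sha256 source registers (v26-v29): the trailing-block code below + * only fills in the message bytes that are actually defined, so the rest of + * the final sha block must already be zero */ +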
eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b +___ + &aes192_aes256_dec_handle(1,"dec_catchup",3,0); +$code.=<<___; + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + + add x19,x19,4 +/* + * Now, there is the final b-1 sha256 padded block. + * This contains between 0-3 aes blocks. We take some pains to avoid read spill + * by only reading the blocks that are actually defined. + * This is also the final sha block code for the shortCases. + */ +.Ljoin_common: + /* base address for sha round consts */ + adr x8,.Lrcon + mov w15,0x80 /* that's the 1 of the pad */ +.Lpost_loop_Q0: + /* assume this was final block */ + mov v26.b[0],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + /* overwrite previous v26 value (0x80) */ + mov v26.d[0],x2 + /* assume this was final block */ + mov v26.b[8],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + mov v26.d[1],x2 +.Lpost_loop_Q1: + /* assume this is final block */ + mov v27.b[0],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + /* overwrite previous v27 value (0x80) */ + mov v27.d[0],x2 + /* assume this was final block */ + mov v27.b[8],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + mov v27.d[1],x2 +.Lpost_loop_Q2: + /* assume this was final block */ + mov v28.b[0],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + /* overwrite previous v28 value (0x80) */ + mov v28.d[0],x2 + /* assume this was final block */ + mov v28.b[8],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + mov v28.d[1],x2 +.Lpost_loop_Q3: + /* assume this was final block */ + mov v29.b[3],w15 + /* outstanding 8B blocks left */ + cbz x5,.Lpost_loop + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + sub x5,x5,8 + rev32 x2,x2 + /* overwrite previous v29 value (0x80) */ + mov v29.d[0],x2 + /* assume this was final block */ + mov v29.b[11],w15 + /* outstanding 8B blocks left */ + cbz x5,1f + /* at least 8B left to go, it is safe to fetch this data */ + ldr x2,[x3],8 + rev32 x2,x2 + mov v29.d[1],x2 + +/* + * That is enough of blocks, we allow up to 64 bytes in total. + * Now we have the sha256 to do for these 4 16B blocks. 
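+ * Note that v29 is assembled already byte-swapped: the doublewords loaded + * into it above are rev32'd and the 0x80 pad byte is placed at b[3]/b[11], + * which is why the rev32 of v29 below is commented out.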
+ */ +1: + mov x9,x8 + rev32 v26.16b,v26.16b + ld1 {v4.16b},[x9],16 /* key0 */ + rev32 v27.16b,v27.16b + rev32 v28.16b,v28.16b + ld1 {v5.16b},[x9],16 /* key1 */ + //rev32 v29.16b,v29.16b + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key2 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key3 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v27.16b,v27.16b,v27.16b /* zero 
sha src 1 */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* this was final block */ + cbz x5,.Lpost_loop + subs x5,x5,8 + /* loop if hash is not finished */ + b.ne .Lpost_loop_Q0 + /* set "1" of the padding if this was a final block */ + mov v26.b[0],w15 + +.Lpost_loop: + /* Add outstanding bytes of digest source */ + add x11,x11,x20 + /* Add one SHA-2 block since hash is calculated including i_key_pad */ + add x11,x11,#64 + lsr x12,x11,32 /* len_hi */ + and x14,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x14,x14,3 /* len_lo in bits */ + + mov v29.s[3],w14 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + + /* skip write back if there were less than 4 AES blocks */ + cbz x19,1f + /* + * At this point all data should be fetched for SHA. + * Save remaining blocks without danger of overwriting SHA source. + */ + stp q0,q1,[x1],32 + stp q2,q3,[x1],32 +1: + /* + * final sha block + * the strategy is to combine the 0-3 aes blocks, which is faster but + * a little gourmand on code space. + */ + cbz x13,.Lzero_aes_blocks_left /* none to do */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v31.16b},[x0],16 + adr x8,.Lrcon + ld1 {v4.16b},[x8],16 /* key0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + ld1 {v5.16b},[x8],16 /* key1 */ + ld1 {v6.16b},[x8],16 /* key2 */ + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + ld1 {v7.16b},[x8],16 /* key3 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s +___ + &aes192_aes256_dec_handle(1,"dec_final1",0,0); +$code.=<<___; + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad1 + + /* aes xform 1 */ + + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v30.16b},[x0],16 + ld1 {v4.16b},[x8],16 /* key4 */ 
+ ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + sha256su0 v27.4s,v28.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s +___ + &aes192_aes256_dec_handle(1,"dec_final2",0,0); +$code.=<<___; + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v31.16b /* xor w/ ivec (modeop) */ + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad2 + + /* aes xform 2 */ + + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s +___ + &aes192_aes256_dec_handle(1,"dec_final3",0,0); +$code.=<<___; + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + b .Lfrmquad3 +/* + * the final block with no aes component, i.e from here there were zero blocks + */ + +.Lzero_aes_blocks_left: + /* base address for sha round consts */ + adr x8,.Lrcon + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key2 
*/ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ +.Lfrmquad1: + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ +.Lfrmquad2: + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ +.Lfrmquad3: + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v26.16b,v26.16b,v26.16b /* zero reg */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha256h2 q23, q21, v7.4s + + add v26.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v29.16b,v29.16b,v29.16b /* zero reg */ + 
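/* the inner hash accumulates into v26/v27; it forms the 32-byte message of + * the outer hash H(o_key_pad || inner_hash) computed below */ +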
add v27.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* + * Calculate final HMAC + */ + /* base address for sha round consts */ + adr x8,.Lrcon + /* load o_key_pad partial hash */ + ld1 {v24.16b},[x7],16 + ld1 {v25.16b},[x7] + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v28.b[3], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+32 + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key2 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key3 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x8],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x8],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x8],16 /* key8 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x8],16 /* key9 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key10 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key11 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x8],16 /* key12 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x8],16 /* key13 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + mov 
v21.16b, v22.16b /* copy abcd */ + + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x8],16 /* key14 */ + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x8],16 /* key15 */ + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + + sha256h q22, q23, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + ldp d10,d11,[sp,#16] + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + + sha256h q22, q23, v7.4s + ldp d12,d13,[sp,#32] + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + ldp d14,d15,[sp,#48] + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + rev32 v24.16b, v24.16b + ldp x19,x20,[sp,#64] + ldp d8,d9,[sp],#80 + rev32 v25.16b, v25.16b + st1 {v24.4s},[x4],16 + mov x0, xzr + st1 {v25.4s},[x4] + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. + * x10 = aes_blocks + */ +.Ldec_short_cases: + ldp q8,q9,[x9],32 + adr x8,.Lrcon /* rcon */ + ldp q10,q11,[x9],32 + lsl x11,x10,4 /* len=aes_blocks*16 */ + + ldp q12,q13,[x9],32 + ldp q14,q15,[x9],32 + ld1 {v30.16b},[x6] /* get ivec */ + ldp q16,q17,[x9],32 + ld1 {v18.16b},[x9] + + /* get outstanding bytes of the digest */ + sub x20,x5,x2 + + /* indicate AES blocks to write back */ + mov x19,xzr + + mov x2,x0 + + /* + * Digest source has to be at least of cipher source length + * therefore it is safe to use x10 to indicate whether we can + * overtake cipher processing by 4 AES block here. + */ + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + + sub x5,x5,64 + + mov x9,x8 /* top of rcon */ + + /* quad 0 */ + ld1 {v26.16b},[x3],16 + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v27.16b},[x3],16 + rev32 v26.16b,v26.16b + ld1 {v28.16b},[x3],16 + rev32 v27.16b,v27.16b + ld1 {v29.16b},[x3],16 + rev32 v28.16b,v28.16b + ld1 {v5.16b},[x9],16 /* key1 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + rev32 v29.16b,v29.16b + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key2 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key3 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 
v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* there were at least 4 AES blocks to process */ + b .Lshort_loop_no_store + +.Ldec_short_loop: + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + stp q0,q1,[x1],32 + stp q2,q3,[x1],32 + + sub x19,x19,4 + +.Lshort_loop_no_store: + ld1 {v31.16b},[x2] /* next w no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x2],16 + + add x0,x0,64 + + /* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",0,0); +$code.=<<___; + eor v0.16b,v0.16b,v30.16b /* xor w/prev value */ + + ld1 {v30.16b},[x2] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x2],16 + + /* aes xform 1 */ + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + 
aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",1,0); +$code.=<<___; + eor v1.16b,v1.16b,v31.16b /* xor w/prev value */ + + ld1 {v31.16b},[x2] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x2],16 + + /* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",2,0); +$code.=<<___; + eor v2.16b,v2.16b,v30.16b /* xor w/prev value */ + + ld1 {v30.16b},[x2] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x2],16 + + /* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b +___ + &aes192_aes256_dec_handle(1,"dec_short",3,0); +$code.=<<___; + eor v3.16b,v3.16b,v31.16b /* xor w/prev value */ + + add x19,x19,4 + + sub x10,x10,4 /* 4 less */ + cmp x5,64 + b.lt .Ldec_short_loop /* keep looping */ + + sub x5,x5,64 + + mov x9,x8 /* top of rcon */ + + /* quad 0 */ + ld1 {v26.16b},[x3],16 + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v27.16b},[x3],16 + rev32 v26.16b,v26.16b + ld1 {v28.16b},[x3],16 + rev32 v27.16b,v27.16b + ld1 {v29.16b},[x3],16 + rev32 v28.16b,v28.16b + ld1 {v5.16b},[x9],16 /* key1 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + rev32 v29.16b,v29.16b + + sha256su0 v26.4s,v27.4s + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key2 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key3 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 1 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h 
q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256h2 q23, q21, v4.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256h2 q23, q21, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256h2 q23, q21, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + + /* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v6.16b},[x9],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v7.16b},[x9],16 /* key7 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + b .Ldec_short_loop /* keep looping */ +/* + * This is arranged so that we can join the common unwind code that does + * the last sha block and the final 0-3 aes blocks. + */ +.Llast_sha_block: + eor v26.16b,v26.16b,v26.16b /* zero the rest */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + + mov x13,x10 /* copy aes blocks for common */ + b .Ljoin_common /* join common code */ + +.size asm_sha256_hmac_aescbc_dec, .-asm_sha256_hmac_aescbc_dec +___ + +if ($flavour =~ /64/) { + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; \ No newline at end of file diff --git a/crypto/aes/build.info b/crypto/aes/build.info index 11d27d0451c..661b34592f2 100644 --- a/crypto/aes/build.info +++ b/crypto/aes/build.info @@ -31,7 +31,9 @@ IF[{- !$disabled{asm} -}] $AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S $AESDEF_armv4=AES_ASM BSAES_ASM - $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S + $AESASM_aarch64=\ + aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S \ + aes-sha1-armv8.S aes-sha256-armv8.S $AESDEF_aarch64=BSAES_ASM VPAES_ASM $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s @@ -137,6 +139,10 @@ GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl INCLUDE[aesv8-armx.o]=.. GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl INCLUDE[vpaes-armv8.o]=.. 
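+# stitched AES-CBC + HMAC-SHA1/SHA256 (enc-then-mac) kernels, aarch64 only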
+GENERATE[aes-sha1-armv8.S]=asm/aes-sha1-armv8.pl +INCLUDE[aes-sha1-armv8.o]=.. +GENERATE[aes-sha256-armv8.S]=asm/aes-sha256-armv8.pl +INCLUDE[aes-sha256-armv8.o]=.. GENERATE[aes-armv4.S]=asm/aes-armv4.pl INCLUDE[aes-armv4.o]=.. diff --git a/crypto/evp/evp_lib.c b/crypto/evp/evp_lib.c index 32ada929e1b..6c44b845c99 100644 --- a/crypto/evp/evp_lib.c +++ b/crypto/evp/evp_lib.c @@ -320,11 +320,12 @@ int EVP_CIPHER_get_type(const EVP_CIPHER *cipher) int evp_cipher_cache_constants(EVP_CIPHER *cipher) { int ok, aead = 0, custom_iv = 0, cts = 0, multiblock = 0, randkey = 0; + int encrypt_then_mac = 0; size_t ivlen = 0; size_t blksz = 0; size_t keylen = 0; unsigned int mode = 0; - OSSL_PARAM params[10]; + OSSL_PARAM params[11]; params[0] = OSSL_PARAM_construct_size_t(OSSL_CIPHER_PARAM_BLOCK_SIZE, &blksz); params[1] = OSSL_PARAM_construct_size_t(OSSL_CIPHER_PARAM_IVLEN, &ivlen); @@ -338,7 +339,9 @@ int evp_cipher_cache_constants(EVP_CIPHER *cipher) &multiblock); params[8] = OSSL_PARAM_construct_int(OSSL_CIPHER_PARAM_HAS_RAND_KEY, &randkey); - params[9] = OSSL_PARAM_construct_end(); + params[9] = OSSL_PARAM_construct_int(OSSL_CIPHER_PARAM_ENCRYPT_THEN_MAC, + &encrypt_then_mac); + params[10] = OSSL_PARAM_construct_end(); ok = evp_do_ciph_getparams(cipher, params) > 0; if (ok) { cipher->block_size = blksz; @@ -357,6 +360,8 @@ int evp_cipher_cache_constants(EVP_CIPHER *cipher) cipher->flags |= EVP_CIPH_FLAG_CUSTOM_CIPHER; if (randkey) cipher->flags |= EVP_CIPH_RAND_KEY; + if (encrypt_then_mac) + cipher->flags |= EVP_CIPH_FLAG_ENC_THEN_MAC; if (OSSL_PARAM_locate_const(EVP_CIPHER_gettable_ctx_params(cipher), OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS)) cipher->flags |= EVP_CIPH_FLAG_CUSTOM_ASN1; diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h index 8790de50dd6..4bd45d3558b 100644 --- a/crypto/objects/obj_dat.h +++ b/crypto/objects/obj_dat.h @@ -1350,7 +1350,7 @@ static const unsigned char so[9517] = { 0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x03,0x2E, /* [ 9507] OBJ_SLH_DSA_SHAKE_256f_WITH_SHAKE256 */ }; -#define NUM_NID 1487 +#define NUM_NID 1493 static const ASN1_OBJECT nid_objs[NUM_NID] = { {"UNDEF", "undefined", NID_undef}, {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]}, @@ -2839,14 +2839,22 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = { {"id-hash-slh-dsa-shake-192f-with-shake256", "SLH-DSA-SHAKE-192f-WITH-SHAKE256", NID_SLH_DSA_SHAKE_192f_WITH_SHAKE256, 9, &so[9489]}, {"id-hash-slh-dsa-shake-256s-with-shake256", "SLH-DSA-SHAKE-256s-WITH-SHAKE256", NID_SLH_DSA_SHAKE_256s_WITH_SHAKE256, 9, &so[9498]}, {"id-hash-slh-dsa-shake-256f-with-shake256", "SLH-DSA-SHAKE-256f-WITH-SHAKE256", NID_SLH_DSA_SHAKE_256f_WITH_SHAKE256, 9, &so[9507]}, + {"AES-128-CBC-HMAC-SHA1-ETM", "aes-128-cbc-hmac-sha1-etm", NID_aes_128_cbc_hmac_sha1_etm}, + {"AES-192-CBC-HMAC-SHA1-ETM", "aes-192-cbc-hmac-sha1-etm", NID_aes_192_cbc_hmac_sha1_etm}, + {"AES-256-CBC-HMAC-SHA1-ETM", "aes-256-cbc-hmac-sha1-etm", NID_aes_256_cbc_hmac_sha1_etm}, + {"AES-128-CBC-HMAC-SHA256-ETM", "aes-128-cbc-hmac-sha256-etm", NID_aes_128_cbc_hmac_sha256_etm}, + {"AES-192-CBC-HMAC-SHA256-ETM", "aes-192-cbc-hmac-sha256-etm", NID_aes_192_cbc_hmac_sha256_etm}, + {"AES-256-CBC-HMAC-SHA256-ETM", "aes-256-cbc-hmac-sha256-etm", NID_aes_256_cbc_hmac_sha256_etm}, }; -#define NUM_SN 1478 +#define NUM_SN 1484 static const unsigned int sn_objs[NUM_SN] = { 364, /* "AD_DVCS" */ 419, /* "AES-128-CBC" */ 916, /* "AES-128-CBC-HMAC-SHA1" */ + 1487, /* "AES-128-CBC-HMAC-SHA1-ETM" */ 948, /* "AES-128-CBC-HMAC-SHA256" */ + 1490, /* 
"AES-128-CBC-HMAC-SHA256-ETM" */ 421, /* "AES-128-CFB" */ 650, /* "AES-128-CFB1" */ 653, /* "AES-128-CFB8" */ @@ -2858,7 +2866,9 @@ static const unsigned int sn_objs[NUM_SN] = { 913, /* "AES-128-XTS" */ 423, /* "AES-192-CBC" */ 917, /* "AES-192-CBC-HMAC-SHA1" */ + 1488, /* "AES-192-CBC-HMAC-SHA1-ETM" */ 949, /* "AES-192-CBC-HMAC-SHA256" */ + 1491, /* "AES-192-CBC-HMAC-SHA256-ETM" */ 425, /* "AES-192-CFB" */ 651, /* "AES-192-CFB1" */ 654, /* "AES-192-CFB8" */ @@ -2869,7 +2879,9 @@ static const unsigned int sn_objs[NUM_SN] = { 1199, /* "AES-192-SIV" */ 427, /* "AES-256-CBC" */ 918, /* "AES-256-CBC-HMAC-SHA1" */ + 1489, /* "AES-256-CBC-HMAC-SHA1-ETM" */ 950, /* "AES-256-CBC-HMAC-SHA256" */ + 1492, /* "AES-256-CBC-HMAC-SHA256-ETM" */ 429, /* "AES-256-CFB" */ 652, /* "AES-256-CFB1" */ 655, /* "AES-256-CFB8" */ @@ -4323,7 +4335,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1289, /* "zstd" */ }; -#define NUM_LN 1478 +#define NUM_LN 1484 static const unsigned int ln_objs[NUM_LN] = { 363, /* "AD Time Stamping" */ 405, /* "ANSI X9.62" */ @@ -4758,7 +4770,9 @@ static const unsigned int ln_objs[NUM_LN] = { 606, /* "additional verification" */ 419, /* "aes-128-cbc" */ 916, /* "aes-128-cbc-hmac-sha1" */ + 1487, /* "aes-128-cbc-hmac-sha1-etm" */ 948, /* "aes-128-cbc-hmac-sha256" */ + 1490, /* "aes-128-cbc-hmac-sha256-etm" */ 896, /* "aes-128-ccm" */ 421, /* "aes-128-cfb" */ 650, /* "aes-128-cfb1" */ @@ -4772,7 +4786,9 @@ static const unsigned int ln_objs[NUM_LN] = { 913, /* "aes-128-xts" */ 423, /* "aes-192-cbc" */ 917, /* "aes-192-cbc-hmac-sha1" */ + 1488, /* "aes-192-cbc-hmac-sha1-etm" */ 949, /* "aes-192-cbc-hmac-sha256" */ + 1491, /* "aes-192-cbc-hmac-sha256-etm" */ 899, /* "aes-192-ccm" */ 425, /* "aes-192-cfb" */ 651, /* "aes-192-cfb1" */ @@ -4785,7 +4801,9 @@ static const unsigned int ln_objs[NUM_LN] = { 1199, /* "aes-192-siv" */ 427, /* "aes-256-cbc" */ 918, /* "aes-256-cbc-hmac-sha1" */ + 1489, /* "aes-256-cbc-hmac-sha1-etm" */ 950, /* "aes-256-cbc-hmac-sha256" */ + 1492, /* "aes-256-cbc-hmac-sha256-etm" */ 902, /* "aes-256-ccm" */ 429, /* "aes-256-cfb" */ 652, /* "aes-256-cfb1" */ diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num index 80413e087a0..b4363931112 100644 --- a/crypto/objects/obj_mac.num +++ b/crypto/objects/obj_mac.num @@ -1484,3 +1484,9 @@ SLH_DSA_SHAKE_192s_WITH_SHAKE256 1483 SLH_DSA_SHAKE_192f_WITH_SHAKE256 1484 SLH_DSA_SHAKE_256s_WITH_SHAKE256 1485 SLH_DSA_SHAKE_256f_WITH_SHAKE256 1486 +aes_128_cbc_hmac_sha1_etm 1487 +aes_192_cbc_hmac_sha1_etm 1488 +aes_256_cbc_hmac_sha1_etm 1489 +aes_128_cbc_hmac_sha256_etm 1490 +aes_192_cbc_hmac_sha256_etm 1491 +aes_256_cbc_hmac_sha256_etm 1492 diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt index 06fe6f4bdad..9c61c4a642a 100644 --- a/crypto/objects/objects.txt +++ b/crypto/objects/objects.txt @@ -1721,6 +1721,12 @@ sm-scheme 104 10 : SM4-XTS : sm4-xts : AES-256-CBC-HMAC-SHA256 : aes-256-cbc-hmac-sha256 : ChaCha20-Poly1305 : chacha20-poly1305 : ChaCha20 : chacha20 + : AES-128-CBC-HMAC-SHA1-ETM : aes-128-cbc-hmac-sha1-etm + : AES-192-CBC-HMAC-SHA1-ETM : aes-192-cbc-hmac-sha1-etm + : AES-256-CBC-HMAC-SHA1-ETM : aes-256-cbc-hmac-sha1-etm + : AES-128-CBC-HMAC-SHA256-ETM : aes-128-cbc-hmac-sha256-etm + : AES-192-CBC-HMAC-SHA256-ETM : aes-192-cbc-hmac-sha256-etm + : AES-256-CBC-HMAC-SHA256-ETM : aes-256-cbc-hmac-sha256-etm ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h index 140b78e7cd9..34aa74ecb25 100644 --- 
a/include/crypto/aes_platform.h +++ b/include/crypto/aes_platform.h @@ -92,8 +92,8 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); # endif /* OPENSSL_SYS_AIX || OPENSSL_SYS_MACOSX */ # endif /* PPC */ -# if (defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(_M_ARM64)) -# include "arm_arch.h" +# if (defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(_M_ARM64)) +# include "crypto/arm_arch.h" # if __ARM_MAX_ARCH__>=7 # if defined(BSAES_ASM) # define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) @@ -112,6 +112,13 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); # define ARMv8_HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) # define HWAES_xts_encrypt aes_v8_xts_encrypt # define HWAES_xts_decrypt aes_v8_xts_decrypt +# define HWAES_CBC_HMAC_SHA1_ETM_CAPABLE (HWAES_CAPABLE && \ + (OPENSSL_armcap_P & ARMV8_SHA1)) +# define HWAES_CBC_HMAC_SHA256_ETM_CAPABLE (HWAES_CAPABLE && \ + (OPENSSL_armcap_P & ARMV8_SHA256)) +# ifndef __AARCH64EB__ +# define AES_CBC_HMAC_SHA_ETM_CAPABLE 1 +# endif # endif # define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks # define HWAES_ctr32_encrypt_blocks_unroll12_eor3 aes_v8_ctr32_encrypt_blocks_unroll12_eor3 diff --git a/include/openssl/evp.h b/include/openssl/evp.h index 8e38ab29ea3..4fe841da35e 100644 --- a/include/openssl/evp.h +++ b/include/openssl/evp.h @@ -376,6 +376,7 @@ OSSL_DEPRECATEDIN_3_0 int /* For supplementary wrap cipher support */ # define EVP_CIPH_FLAG_GET_WRAP_CIPHER 0x4000000 # define EVP_CIPH_FLAG_INVERSE_CIPHER 0x8000000 +# define EVP_CIPH_FLAG_ENC_THEN_MAC 0x10000000 /* * Cipher context flag to indicate we can handle wrap mode: if allowed in diff --git a/include/openssl/obj_mac.h b/include/openssl/obj_mac.h index ee71e26458f..0f3d79a889f 100644 --- a/include/openssl/obj_mac.h +++ b/include/openssl/obj_mac.h @@ -5458,6 +5458,30 @@ #define LN_chacha20 "chacha20" #define NID_chacha20 1019 +#define SN_aes_128_cbc_hmac_sha1_etm "AES-128-CBC-HMAC-SHA1-ETM" +#define LN_aes_128_cbc_hmac_sha1_etm "aes-128-cbc-hmac-sha1-etm" +#define NID_aes_128_cbc_hmac_sha1_etm 1487 + +#define SN_aes_192_cbc_hmac_sha1_etm "AES-192-CBC-HMAC-SHA1-ETM" +#define LN_aes_192_cbc_hmac_sha1_etm "aes-192-cbc-hmac-sha1-etm" +#define NID_aes_192_cbc_hmac_sha1_etm 1488 + +#define SN_aes_256_cbc_hmac_sha1_etm "AES-256-CBC-HMAC-SHA1-ETM" +#define LN_aes_256_cbc_hmac_sha1_etm "aes-256-cbc-hmac-sha1-etm" +#define NID_aes_256_cbc_hmac_sha1_etm 1489 + +#define SN_aes_128_cbc_hmac_sha256_etm "AES-128-CBC-HMAC-SHA256-ETM" +#define LN_aes_128_cbc_hmac_sha256_etm "aes-128-cbc-hmac-sha256-etm" +#define NID_aes_128_cbc_hmac_sha256_etm 1490 + +#define SN_aes_192_cbc_hmac_sha256_etm "AES-192-CBC-HMAC-SHA256-ETM" +#define LN_aes_192_cbc_hmac_sha256_etm "aes-192-cbc-hmac-sha256-etm" +#define NID_aes_192_cbc_hmac_sha256_etm 1491 + +#define SN_aes_256_cbc_hmac_sha256_etm "AES-256-CBC-HMAC-SHA256-ETM" +#define LN_aes_256_cbc_hmac_sha256_etm "aes-256-cbc-hmac-sha256-etm" +#define NID_aes_256_cbc_hmac_sha256_etm 1492 + #define SN_dhpublicnumber "dhpublicnumber" #define LN_dhpublicnumber "X9.42 DH" #define NID_dhpublicnumber 920 diff --git a/providers/common/include/prov/providercommon.h b/providers/common/include/prov/providercommon.h index 4a1a043a84e..7621b4b1d62 100644 --- a/providers/common/include/prov/providercommon.h +++ b/providers/common/include/prov/providercommon.h @@ -14,6 +14,8 @@ const OSSL_CORE_HANDLE *FIPS_get_core_handle(OSSL_LIB_CTX *ctx); int 
ossl_cipher_capable_aes_cbc_hmac_sha1(void); int ossl_cipher_capable_aes_cbc_hmac_sha256(void); +int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void); +int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void); OSSL_FUNC_provider_get_capabilities_fn ossl_prov_get_capabilities; diff --git a/providers/defltprov.c b/providers/defltprov.c index eee2178b416..6e33f6ee66b 100644 --- a/providers/defltprov.c +++ b/providers/defltprov.c @@ -229,6 +229,18 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = { ossl_cipher_capable_aes_cbc_hmac_sha256), ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256, ossl_aes256cbc_hmac_sha256_functions, ossl_cipher_capable_aes_cbc_hmac_sha256), + ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM, ossl_aes128cbc_hmac_sha1_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha1_etm), + ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA1_ETM, ossl_aes192cbc_hmac_sha1_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha1_etm), + ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM, ossl_aes256cbc_hmac_sha1_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha1_etm), + ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM, ossl_aes128cbc_hmac_sha256_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha256_etm), + ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM, ossl_aes192cbc_hmac_sha256_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha256_etm), + ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM, ossl_aes256cbc_hmac_sha256_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha256_etm), #ifndef OPENSSL_NO_ARIA ALG(PROV_NAMES_ARIA_256_GCM, ossl_aria256gcm_functions), ALG(PROV_NAMES_ARIA_192_GCM, ossl_aria192gcm_functions), diff --git a/providers/fips/fipsprov.c b/providers/fips/fipsprov.c index 373cd1c2e4c..03258fc97f5 100644 --- a/providers/fips/fipsprov.c +++ b/providers/fips/fipsprov.c @@ -354,6 +354,18 @@ static const OSSL_ALGORITHM_CAPABLE fips_ciphers[] = { ossl_cipher_capable_aes_cbc_hmac_sha256), ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256, ossl_aes256cbc_hmac_sha256_functions, ossl_cipher_capable_aes_cbc_hmac_sha256), + ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM, ossl_aes128cbc_hmac_sha1_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha1_etm), + ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA1_ETM, ossl_aes192cbc_hmac_sha1_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha1_etm), + ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM, ossl_aes256cbc_hmac_sha1_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha1_etm), + ALGC(PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM, ossl_aes128cbc_hmac_sha256_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha256_etm), + ALGC(PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM, ossl_aes192cbc_hmac_sha256_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha256_etm), + ALGC(PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM, ossl_aes256cbc_hmac_sha256_etm_functions, + ossl_cipher_capable_aes_cbc_hmac_sha256_etm), #ifndef OPENSSL_NO_DES ALG(PROV_NAMES_DES_EDE3_ECB, ossl_tdes_ede3_ecb_functions), ALG(PROV_NAMES_DES_EDE3_CBC, ossl_tdes_ede3_cbc_functions), diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info index 1837070c211..47c140ace11 100644 --- a/providers/implementations/ciphers/build.info +++ b/providers/implementations/ciphers/build.info @@ -105,6 +105,9 @@ SOURCE[$AES_GOAL]=\ cipher_aes_wrp.c \ cipher_aes_cbc_hmac_sha.c \ cipher_aes_cbc_hmac_sha256_hw.c cipher_aes_cbc_hmac_sha1_hw.c \ + cipher_aes_cbc_hmac_sha_etm.c \ + cipher_aes_cbc_hmac_sha1_etm_hw.c \ + cipher_aes_cbc_hmac_sha256_etm_hw.c \ cipher_cts.c DEFINE[$AES_GOAL]=$AESXTSDEF diff --git 
a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c new file mode 100644 index 00000000000..5d164ff5d71 --- /dev/null +++ b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_etm_hw.c @@ -0,0 +1,179 @@ +/* + * Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +#include "internal/deprecated.h" +#include "cipher_aes_cbc_hmac_sha_etm.h" + +#if !defined(AES_CBC_HMAC_SHA_ETM_CAPABLE) +int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void) +{ + return 0; +} + +const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void) +{ + return NULL; +} +#else +void sha1_block_data_order(void *c, const void *p, size_t len); + +# if defined(__aarch64__) +int asm_aescbc_sha1_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen, + uint8_t *dsrc, uint8_t *ddst, uint64_t dlen, + CIPH_DIGEST *arg); +void asm_sha1_hmac_aescbc_dec(const uint8_t *csrc, uint8_t *cdst, uint64_t clen, + const unsigned char *dsrc, uint8_t *ddst, size_t dlen, + CIPH_DIGEST *arg); +# define HWAES128_ENC_CBC_SHA1_ETM asm_aescbc_sha1_hmac +# define HWAES128_DEC_CBC_SHA1_ETM asm_sha1_hmac_aescbc_dec +# endif + +int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void) +{ + return HWAES_CBC_HMAC_SHA1_ETM_CAPABLE; +} + +static int aes_cbc_hmac_sha1_init_key(PROV_CIPHER_CTX *vctx, + const unsigned char *key, size_t keylen) +{ + int ret; + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + PROV_AES_HMAC_SHA1_ETM_CTX *sctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx; + + if (ctx->base.enc) + ret = aes_v8_set_encrypt_key(key, keylen * 8, &ctx->ks); + else + ret = aes_v8_set_decrypt_key(key, keylen * 8, &ctx->ks); + + SHA1_Init(&sctx->head); /* handy when benchmarking */ + sctx->tail = sctx->head; + + return ret < 0 ? 
0 : 1; +} + +static void ciph_digest_arg_init(CIPH_DIGEST *arg, PROV_CIPHER_CTX *vctx) +{ + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + PROV_AES_HMAC_SHA1_ETM_CTX *sctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx; + + arg->cipher.key = (uint8_t *)&(ctx->ks); + arg->cipher.key_rounds = ctx->ks.rounds; + arg->cipher.iv = (uint8_t *)&(ctx->base.iv); + arg->digest.hmac.i_key_pad = (uint8_t *)&(sctx->head); + arg->digest.hmac.o_key_pad = (uint8_t *)&(sctx->tail); +} + +static int hwaes_cbc_hmac_sha1_etm(PROV_CIPHER_CTX *vctx, + unsigned char *out, + const unsigned char *in, size_t len) +{ + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + CIPH_DIGEST arg = {0}; + ciph_digest_arg_init(&arg, vctx); + if (len % AES_BLOCK_SIZE) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH); + return 0; + } + if (ctx->base.enc) { + HWAES128_ENC_CBC_SHA1_ETM(in, out, len, out, ctx->tag, len, &arg); + return 1; + } else { + if (ctx->taglen == 0) { + ERR_raise(ERR_LIB_PROV, PROV_R_TAG_NOT_SET); + return 0; + } + HWAES128_DEC_CBC_SHA1_ETM(in, out, len, in, ctx->tag, len, &arg); + if (CRYPTO_memcmp(ctx->exp_tag, ctx->tag, ctx->taglen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG); + return 0; + } + return 1; + } +} + +static void sha1_update(SHA_CTX *c, const void *data, size_t len) +{ + const unsigned char *ptr = data; + size_t res; + + if ((res = c->num)) { + res = SHA_CBLOCK - res; + if (len < res) + res = len; + SHA1_Update(c, ptr, res); + ptr += res; + len -= res; + } + + res = len % SHA_CBLOCK; + len -= res; + + if (len) { + sha1_block_data_order(c, ptr, len / SHA_CBLOCK); + + ptr += len; + c->Nh += len >> 29; + c->Nl += len <<= 3; + if (c->Nl < (unsigned int)len) + c->Nh++; + } + + if (res) + SHA1_Update(c, ptr, res); +} + +static void aes_cbc_hmac_sha1_set_mac_key(void *vctx, + const unsigned char *mac, size_t len) +{ + PROV_AES_HMAC_SHA1_ETM_CTX *ctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx; + unsigned int i; + unsigned char hmac_key[64]; + + memset(hmac_key, 0, sizeof(hmac_key)); + + if (len > (int)sizeof(hmac_key)) { + SHA1_Init(&ctx->head); + sha1_update(&ctx->head, mac, len); + SHA1_Final(hmac_key, &ctx->head); + } else { + memcpy(hmac_key, mac, len); + } + + for (i = 0; i < sizeof(hmac_key); i++) + hmac_key[i] ^= 0x36; /* ipad */ + SHA1_Init(&ctx->head); + sha1_update(&ctx->head, hmac_key, sizeof(hmac_key)); + + for (i = 0; i < sizeof(hmac_key); i++) + hmac_key[i] ^= 0x36 ^ 0x5c; /* opad */ + SHA1_Init(&ctx->tail); + sha1_update(&ctx->tail, hmac_key, sizeof(hmac_key)); + + OPENSSL_cleanse(hmac_key, sizeof(hmac_key)); +} + +static int aes_cbc_hmac_sha1_cipher(PROV_CIPHER_CTX *vctx, + unsigned char *out, + const unsigned char *in, size_t len) +{ + return hwaes_cbc_hmac_sha1_etm(vctx, out, in, len); +} + +static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha1_etm = { + { + aes_cbc_hmac_sha1_init_key, + aes_cbc_hmac_sha1_cipher + }, + aes_cbc_hmac_sha1_set_mac_key +}; + +const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void) +{ + return &cipher_hw_aes_hmac_sha1_etm; +} +#endif diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c new file mode 100644 index 00000000000..8a5474fc655 --- /dev/null +++ b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_etm_hw.c @@ -0,0 +1,179 @@ +/* + * Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +#include "internal/deprecated.h" +#include "cipher_aes_cbc_hmac_sha_etm.h" + +#if !defined(AES_CBC_HMAC_SHA_ETM_CAPABLE) +int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void) +{ + return 0; +} + +const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void) +{ + return NULL; +} +#else +void sha256_block_data_order(void *c, const void *p, size_t len); + +# if defined(__aarch64__) +int asm_aescbc_sha256_hmac(const uint8_t *csrc, uint8_t *cdst, uint64_t clen, + uint8_t *dsrc, uint8_t *ddst, uint64_t dlen, + CIPH_DIGEST *arg); +void asm_sha256_hmac_aescbc_dec(const uint8_t *csrc, uint8_t *cdst, uint64_t clen, + const unsigned char *dsrc, uint8_t *ddst, size_t dlen, + CIPH_DIGEST *arg); +# define HWAES128_ENC_CBC_SHA256_ETM asm_aescbc_sha256_hmac +# define HWAES128_DEC_CBC_SHA256_ETM asm_sha256_hmac_aescbc_dec +# endif + +int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void) +{ + return HWAES_CBC_HMAC_SHA256_ETM_CAPABLE; +} + +static int aes_cbc_hmac_sha256_init_key(PROV_CIPHER_CTX *vctx, + const unsigned char *key, size_t keylen) +{ + int ret; + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + PROV_AES_HMAC_SHA256_ETM_CTX *sctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx; + + if (ctx->base.enc) + ret = aes_v8_set_encrypt_key(key, keylen * 8, &ctx->ks); + else + ret = aes_v8_set_decrypt_key(key, keylen * 8, &ctx->ks); + + SHA256_Init(&sctx->head); /* handy when benchmarking */ + sctx->tail = sctx->head; + + return ret < 0 ? 0 : 1; +} + +static void ciph_digest_arg_init(CIPH_DIGEST *arg, PROV_CIPHER_CTX *vctx) +{ + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + PROV_AES_HMAC_SHA256_ETM_CTX *sctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx; + + arg->cipher.key = (uint8_t *)&(ctx->ks); + arg->cipher.key_rounds = ctx->ks.rounds; + arg->cipher.iv = (uint8_t *)&(ctx->base.iv); + arg->digest.hmac.i_key_pad = (uint8_t *)&(sctx->head); + arg->digest.hmac.o_key_pad = (uint8_t *)&(sctx->tail); +} + +static int hwaes_cbc_hmac_sha256_etm(PROV_CIPHER_CTX *vctx, + unsigned char *out, + const unsigned char *in, size_t len) +{ + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + CIPH_DIGEST arg = {0}; + ciph_digest_arg_init(&arg, vctx); + if (len % AES_BLOCK_SIZE) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_INPUT_LENGTH); + return 0; + } + if (ctx->base.enc) { + HWAES128_ENC_CBC_SHA256_ETM(in, out, len, out, ctx->tag, len, &arg); + return 1; + } else { + if (ctx->taglen == 0) { + ERR_raise(ERR_LIB_PROV, PROV_R_TAG_NOT_SET); + return 0; + } + HWAES128_DEC_CBC_SHA256_ETM(in, out, len, in, ctx->tag, len, &arg); + if (CRYPTO_memcmp(ctx->exp_tag, ctx->tag, ctx->taglen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG); + return 0; + } + return 1; + } +} + +static void sha256_update(SHA256_CTX *c, const void *data, size_t len) +{ + const unsigned char *ptr = data; + size_t res; + + if ((res = c->num)) { + res = SHA256_CBLOCK - res; + if (len < res) + res = len; + SHA256_Update(c, ptr, res); + ptr += res; + len -= res; + } + + res = len % SHA256_CBLOCK; + len -= res; + + if (len) { + sha256_block_data_order(c, ptr, len / SHA256_CBLOCK); + + ptr += len; + c->Nh += len >> 29; + c->Nl += len <<= 3; + if (c->Nl < (unsigned int)len) + c->Nh++; + } + + if (res) + SHA256_Update(c, ptr, 
res); +} + +static void aes_cbc_hmac_sha256_set_mac_key(void *vctx, + const unsigned char *mac, size_t len) +{ + PROV_AES_HMAC_SHA256_ETM_CTX *ctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx; + unsigned int i; + unsigned char hmac_key[64]; + + memset(hmac_key, 0, sizeof(hmac_key)); + + if (len > (int)sizeof(hmac_key)) { + SHA256_Init(&ctx->head); + sha256_update(&ctx->head, mac, len); + SHA256_Final(hmac_key, &ctx->head); + } else { + memcpy(hmac_key, mac, len); + } + + for (i = 0; i < sizeof(hmac_key); i++) + hmac_key[i] ^= 0x36; /* ipad */ + SHA256_Init(&ctx->head); + sha256_update(&ctx->head, hmac_key, sizeof(hmac_key)); + + for (i = 0; i < sizeof(hmac_key); i++) + hmac_key[i] ^= 0x36 ^ 0x5c; /* opad */ + SHA256_Init(&ctx->tail); + sha256_update(&ctx->tail, hmac_key, sizeof(hmac_key)); + + OPENSSL_cleanse(hmac_key, sizeof(hmac_key)); +} + +static int aes_cbc_hmac_sha256_cipher(PROV_CIPHER_CTX *vctx, + unsigned char *out, + const unsigned char *in, size_t len) +{ + return hwaes_cbc_hmac_sha256_etm(vctx, out, in, len); +} + +static const PROV_CIPHER_HW_AES_HMAC_SHA_ETM cipher_hw_aes_hmac_sha256_etm = { + { + aes_cbc_hmac_sha256_init_key, + aes_cbc_hmac_sha256_cipher + }, + aes_cbc_hmac_sha256_set_mac_key +}; + +const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void) +{ + return &cipher_hw_aes_hmac_sha256_etm; +} +#endif diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c new file mode 100644 index 00000000000..0292511353d --- /dev/null +++ b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.c @@ -0,0 +1,310 @@ +/* + * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +#include "internal/deprecated.h" + +#include "cipher_aes_cbc_hmac_sha_etm.h" +#include "prov/providercommon.h" +#include "prov/ciphercommon_aead.h" +#include "prov/implementations.h" + +#ifndef AES_CBC_HMAC_SHA_ETM_CAPABLE +# define IMPLEMENT_CIPHER(nm, sub, kbits, blkbits, ivbits, flags) \ +const OSSL_DISPATCH ossl_##nm##kbits##sub##_functions[] = { \ + OSSL_DISPATCH_END \ +}; +#else +static OSSL_FUNC_cipher_encrypt_init_fn aes_einit; +static OSSL_FUNC_cipher_decrypt_init_fn aes_dinit; +static OSSL_FUNC_cipher_gettable_ctx_params_fn aes_gettable_ctx_params; +static OSSL_FUNC_cipher_settable_ctx_params_fn aes_settable_ctx_params; +# define aes_gettable_params ossl_cipher_generic_gettable_params +# define aes_update ossl_cipher_generic_stream_update +# define aes_final ossl_cipher_generic_stream_final +# define aes_cipher ossl_cipher_generic_cipher + +static int aes_set_ctx_params(void *vctx, const OSSL_PARAM params[]) +{ + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + PROV_CIPHER_HW_AES_HMAC_SHA_ETM *hw = + (PROV_CIPHER_HW_AES_HMAC_SHA_ETM *)ctx->hw; + const OSSL_PARAM *p; + + if (params == NULL) + return 1; + + p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_AEAD_MAC_KEY); + if (p != NULL) { + if (p->data_type != OSSL_PARAM_OCTET_STRING) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER); + return 0; + } + hw->init_mac_key(ctx, p->data, p->data_size); + } + + p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_KEYLEN); + if (p != NULL) { + size_t keylen; + + if (!OSSL_PARAM_get_size_t(p, &keylen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER); + return 0; + } + if (ctx->base.keylen != keylen) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_KEY_LENGTH); + return 0; + } + } + + p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_HMAC_PARAM_MAC); + if (p != NULL) { + size_t sz; + void *vp; + + vp = &ctx->exp_tag; + if (!OSSL_PARAM_get_octet_string(p, &vp, AES_CBC_MAX_HMAC_SIZE, &sz)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER); + return 0; + } + if (sz == 0) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG); + return 0; + } + ctx->taglen = sz; + } + + return 1; +} + +static int aes_einit(void *ctx, const unsigned char *key, size_t keylen, + const unsigned char *iv, size_t ivlen, + const OSSL_PARAM params[]) +{ + if (!ossl_cipher_generic_einit(ctx, key, keylen, iv, ivlen, NULL)) + return 0; + return aes_set_ctx_params(ctx, params); +} + +static int aes_dinit(void *ctx, const unsigned char *key, size_t keylen, + const unsigned char *iv, size_t ivlen, + const OSSL_PARAM params[]) +{ + if (!ossl_cipher_generic_dinit(ctx, key, keylen, iv, ivlen, NULL)) + return 0; + return aes_set_ctx_params(ctx, params); +} + +static int aes_get_ctx_params(void *vctx, OSSL_PARAM params[]) +{ + PROV_AES_HMAC_SHA_ETM_CTX *ctx = (PROV_AES_HMAC_SHA_ETM_CTX *)vctx; + OSSL_PARAM *p; + size_t sz; + + p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_KEYLEN); + if (p != NULL && !OSSL_PARAM_set_size_t(p, ctx->base.keylen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); + return 0; + } + p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_IVLEN); + if (p != NULL && !OSSL_PARAM_set_size_t(p, ctx->base.ivlen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); + return 0; + } + p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_IV); + if (p != NULL + && !OSSL_PARAM_set_octet_string(p, ctx->base.oiv, ctx->base.ivlen) + 
&& !OSSL_PARAM_set_octet_ptr(p, &ctx->base.oiv, ctx->base.ivlen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); + return 0; + } + p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_UPDATED_IV); + if (p != NULL + && !OSSL_PARAM_set_octet_string(p, ctx->base.iv, ctx->base.ivlen) + && !OSSL_PARAM_set_octet_ptr(p, &ctx->base.iv, ctx->base.ivlen)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); + return 0; + } + p = OSSL_PARAM_locate(params, OSSL_CIPHER_HMAC_PARAM_MAC); + if (p != NULL) { + sz = p->data_size; + if (sz == 0 + || sz > AES_CBC_MAX_HMAC_SIZE + || !ctx->base.enc + || ctx->taglen == UNINITIALISED_SIZET) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_TAG); + return 0; + } + if (!OSSL_PARAM_set_octet_string(p, ctx->tag, sz)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); + return 0; + } + } + return 1; +} + +static const OSSL_PARAM cipher_aes_known_gettable_ctx_params[] = { + OSSL_PARAM_size_t(OSSL_CIPHER_PARAM_KEYLEN, NULL), + OSSL_PARAM_size_t(OSSL_CIPHER_PARAM_IVLEN, NULL), + OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_IV, NULL, 0), + OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_UPDATED_IV, NULL, 0), + OSSL_PARAM_END +}; + +const OSSL_PARAM *aes_gettable_ctx_params(ossl_unused void *cctx, + ossl_unused void *provctx) +{ + return cipher_aes_known_gettable_ctx_params; +} + +static const OSSL_PARAM cipher_aes_known_settable_ctx_params[] = { + OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_AEAD_MAC_KEY, NULL, 0), + OSSL_PARAM_octet_string(OSSL_CIPHER_PARAM_AEAD_TLS1_AAD, NULL, 0), + OSSL_PARAM_size_t(OSSL_CIPHER_PARAM_KEYLEN, NULL), + OSSL_PARAM_END +}; + +const OSSL_PARAM *aes_settable_ctx_params(ossl_unused void *cctx, + ossl_unused void *provctx) +{ + return cipher_aes_known_settable_ctx_params; +} + +static void base_ctx_init(void *provctx, PROV_AES_HMAC_SHA_ETM_CTX *ctx, + const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *meths, + size_t kbits, size_t blkbits, size_t ivbits, + uint64_t flags) +{ + ossl_cipher_generic_initkey(&ctx->base, kbits, blkbits, ivbits, + EVP_CIPH_CBC_MODE, flags, + &meths->base, provctx); + ctx->hw = (PROV_CIPHER_HW_AES_HMAC_SHA_ETM *)ctx->base.hw; +} + +static void *aes_cbc_hmac_sha1_etm_newctx(void *provctx, size_t kbits, + size_t blkbits, size_t ivbits, + uint64_t flags) +{ + PROV_AES_HMAC_SHA1_ETM_CTX *ctx; + + if (!ossl_prov_is_running()) + return NULL; + + ctx = OPENSSL_zalloc(sizeof(*ctx)); + if (ctx != NULL) + base_ctx_init(provctx, &ctx->base_ctx, + ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(), kbits, blkbits, + ivbits, flags); + return ctx; +} + +static void aes_cbc_hmac_sha1_etm_freectx(void *vctx) +{ + PROV_AES_HMAC_SHA1_ETM_CTX *ctx = (PROV_AES_HMAC_SHA1_ETM_CTX *)vctx; + + if (ctx != NULL) { + ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx); + OPENSSL_clear_free(ctx, sizeof(*ctx)); + } +} + +static void *aes_cbc_hmac_sha1_etm_dupctx(void *provctx) +{ + PROV_AES_HMAC_SHA1_ETM_CTX *ctx = provctx; + + if (ctx == NULL) + return NULL; + + return OPENSSL_memdup(ctx, sizeof(*ctx)); +} + +static void *aes_cbc_hmac_sha256_etm_newctx(void *provctx, size_t kbits, + size_t blkbits, size_t ivbits, + uint64_t flags) +{ + PROV_AES_HMAC_SHA256_ETM_CTX *ctx; + + if (!ossl_prov_is_running()) + return NULL; + + ctx = OPENSSL_zalloc(sizeof(*ctx)); + if (ctx != NULL) + base_ctx_init(provctx, &ctx->base_ctx, + ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(), kbits, blkbits, + ivbits, flags); + return ctx; +} + +static void aes_cbc_hmac_sha256_etm_freectx(void *vctx) +{ + PROV_AES_HMAC_SHA256_ETM_CTX *ctx = (PROV_AES_HMAC_SHA256_ETM_CTX *)vctx; + + 
if (ctx != NULL) { + ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx); + OPENSSL_clear_free(ctx, sizeof(*ctx)); + } +} + +static void *aes_cbc_hmac_sha256_etm_dupctx(void *provctx) +{ + PROV_AES_HMAC_SHA256_ETM_CTX *ctx = provctx; + + if (ctx == NULL) + return NULL; + + return OPENSSL_memdup(ctx, sizeof(*ctx)); +} + +# define IMPLEMENT_CIPHER(nm, sub, kbits, blkbits, ivbits, flags) \ +static OSSL_FUNC_cipher_newctx_fn nm##_##kbits##_##sub##_newctx; \ +static void *nm##_##kbits##_##sub##_newctx(void *provctx) \ +{ \ + return nm##_##sub##_newctx(provctx, kbits, blkbits, ivbits, flags); \ +} \ +static OSSL_FUNC_cipher_get_params_fn nm##_##kbits##_##sub##_get_params; \ +static int nm##_##kbits##_##sub##_get_params(OSSL_PARAM params[]) \ +{ \ + return ossl_cipher_generic_get_params(params, EVP_CIPH_CBC_MODE, \ + flags, kbits, blkbits, ivbits); \ +} \ +const OSSL_DISPATCH ossl_##nm##kbits##sub##_functions[] = { \ + { OSSL_FUNC_CIPHER_NEWCTX, (void (*)(void))nm##_##kbits##_##sub##_newctx },\ + { OSSL_FUNC_CIPHER_FREECTX, (void (*)(void))nm##_##sub##_freectx }, \ + { OSSL_FUNC_CIPHER_DUPCTX, (void (*)(void))nm##_##sub##_dupctx}, \ + { OSSL_FUNC_CIPHER_ENCRYPT_INIT, (void (*)(void))nm##_einit }, \ + { OSSL_FUNC_CIPHER_DECRYPT_INIT, (void (*)(void))nm##_dinit }, \ + { OSSL_FUNC_CIPHER_UPDATE, (void (*)(void))nm##_update }, \ + { OSSL_FUNC_CIPHER_FINAL, (void (*)(void))nm##_final }, \ + { OSSL_FUNC_CIPHER_CIPHER, (void (*)(void))nm##_cipher }, \ + { OSSL_FUNC_CIPHER_GET_PARAMS, \ + (void (*)(void))nm##_##kbits##_##sub##_get_params }, \ + { OSSL_FUNC_CIPHER_GETTABLE_PARAMS, \ + (void (*)(void))nm##_gettable_params }, \ + { OSSL_FUNC_CIPHER_GET_CTX_PARAMS, \ + (void (*)(void))nm##_get_ctx_params }, \ + { OSSL_FUNC_CIPHER_GETTABLE_CTX_PARAMS, \ + (void (*)(void))nm##_gettable_ctx_params }, \ + { OSSL_FUNC_CIPHER_SET_CTX_PARAMS, \ + (void (*)(void))nm##_set_ctx_params }, \ + { OSSL_FUNC_CIPHER_SETTABLE_CTX_PARAMS, \ + (void (*)(void))nm##_settable_ctx_params }, \ + OSSL_DISPATCH_END \ +}; +#endif /* AES_CBC_HMAC_SHA_ETM_CAPABLE */ + +/* ossl_aes128cbc_hmac_sha1_etm_functions */ +IMPLEMENT_CIPHER(aes, cbc_hmac_sha1_etm, 128, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC) +/* ossl_aes192cbc_hmac_sha1_etm_functions */ +IMPLEMENT_CIPHER(aes, cbc_hmac_sha1_etm, 192, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC) +/* ossl_aes256cbc_hmac_sha1_etm_functions */ +IMPLEMENT_CIPHER(aes, cbc_hmac_sha1_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC) +/* ossl_aes128cbc_hmac_sha256_etm_functions */ +IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 128, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC) +/* ossl_aes192cbc_hmac_sha256_etm_functions */ +IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 192, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC) +/* ossl_aes256cbc_hmac_sha256_etm_functions */ +IMPLEMENT_CIPHER(aes, cbc_hmac_sha256_etm, 256, 128, 128, EVP_CIPH_FLAG_ENC_THEN_MAC) \ No newline at end of file diff --git a/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h new file mode 100644 index 00000000000..c8b2b1e5ff9 --- /dev/null +++ b/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha_etm.h @@ -0,0 +1,64 @@ +/* + * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/aes.h>
+#include "prov/ciphercommon.h"
+#include "crypto/aes_platform.h"
+
+int ossl_cipher_capable_aes_cbc_hmac_sha1_etm(void);
+int ossl_cipher_capable_aes_cbc_hmac_sha256_etm(void);
+
+typedef struct prov_cipher_hw_aes_hmac_sha_ctx_etm_st {
+    PROV_CIPHER_HW base; /* must be first */
+    void (*init_mac_key)(void *ctx, const unsigned char *inkey, size_t inlen);
+} PROV_CIPHER_HW_AES_HMAC_SHA_ETM;
+
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha1_etm(void);
+const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *ossl_prov_cipher_hw_aes_cbc_hmac_sha256_etm(void);
+
+#ifdef AES_CBC_HMAC_SHA_ETM_CAPABLE
+# include <openssl/aes.h>
+# include <openssl/sha.h>
+
+# define AES_CBC_MAX_HMAC_SIZE 32
+
+typedef struct prov_aes_hmac_sha_etm_ctx_st {
+    PROV_CIPHER_CTX base;
+    AES_KEY ks;
+    const PROV_CIPHER_HW_AES_HMAC_SHA_ETM *hw;
+    unsigned char tag[AES_CBC_MAX_HMAC_SIZE];
+    unsigned char exp_tag[AES_CBC_MAX_HMAC_SIZE];
+    size_t taglen;
+} PROV_AES_HMAC_SHA_ETM_CTX;
+
+typedef struct prov_aes_hmac_sha1_etm_ctx_st {
+    PROV_AES_HMAC_SHA_ETM_CTX base_ctx;
+    SHA_CTX head, tail;
+} PROV_AES_HMAC_SHA1_ETM_CTX;
+
+typedef struct prov_aes_hmac_sha256_etm_ctx_st {
+    PROV_AES_HMAC_SHA_ETM_CTX base_ctx;
+    SHA256_CTX head, tail;
+} PROV_AES_HMAC_SHA256_ETM_CTX;
+
+typedef struct {
+    struct {
+        uint8_t *key;
+        uint8_t key_rounds;
+        uint8_t *iv;
+    } cipher;
+    struct {
+        struct {
+            uint8_t *i_key_pad;
+            uint8_t *o_key_pad;
+        } hmac;
+    } digest;
+} CIPH_DIGEST;
+
+#endif /* AES_CBC_HMAC_SHA_ETM_CAPABLE */
diff --git a/providers/implementations/ciphers/ciphercommon.c b/providers/implementations/ciphers/ciphercommon.c
index b1331b5b64f..3b09506447f 100644
--- a/providers/implementations/ciphers/ciphercommon.c
+++ b/providers/implementations/ciphers/ciphercommon.c
@@ -81,6 +81,12 @@ int ossl_cipher_generic_get_params(OSSL_PARAM params[], unsigned int md,
         ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
         return 0;
     }
+    p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_ENCRYPT_THEN_MAC);
+    if (p != NULL
+        && !OSSL_PARAM_set_int(p, (flags & EVP_CIPH_FLAG_ENC_THEN_MAC) != 0)) {
+        ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
+        return 0;
+    }
     p = OSSL_PARAM_locate(params, OSSL_CIPHER_PARAM_KEYLEN);
     if (p != NULL && !OSSL_PARAM_set_size_t(p, kbits / 8)) {
         ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
index 35b0b0b9740..2b770badc69 100644
--- a/providers/implementations/include/prov/implementations.h
+++ b/providers/implementations/include/prov/implementations.h
@@ -98,6 +98,12 @@ extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha1_functions[];
 extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha1_functions[];
 extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha256_functions[];
 extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha256_functions[];
+extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha1_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes192cbc_hmac_sha1_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha1_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes128cbc_hmac_sha256_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes192cbc_hmac_sha256_etm_functions[];
+extern const OSSL_DISPATCH ossl_aes256cbc_hmac_sha256_etm_functions[];
 
 #ifndef OPENSSL_NO_ARIA
 extern const OSSL_DISPATCH ossl_aria256gcm_functions[];
diff --git
a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h index 3b747ec92c0..19fdf635c07 100644 --- a/providers/implementations/include/prov/names.h +++ b/providers/implementations/include/prov/names.h @@ -211,6 +211,12 @@ #define PROV_NAMES_DES_CFB "DES-CFB:1.3.14.3.2.9" #define PROV_NAMES_DES_CFB1 "DES-CFB1" #define PROV_NAMES_DES_CFB8 "DES-CFB8" +#define PROV_NAMES_AES_128_CBC_HMAC_SHA1_ETM "AES-128-CBC-HMAC-SHA1-ETM" +#define PROV_NAMES_AES_192_CBC_HMAC_SHA1_ETM "AES-192-CBC-HMAC-SHA1-ETM" +#define PROV_NAMES_AES_256_CBC_HMAC_SHA1_ETM "AES-256-CBC-HMAC-SHA1-ETM" +#define PROV_NAMES_AES_128_CBC_HMAC_SHA256_ETM "AES-128-CBC-HMAC-SHA256-ETM" +#define PROV_NAMES_AES_192_CBC_HMAC_SHA256_ETM "AES-192-CBC-HMAC-SHA256-ETM" +#define PROV_NAMES_AES_256_CBC_HMAC_SHA256_ETM "AES-256-CBC-HMAC-SHA256-ETM" /*- * Digests diff --git a/test/evp_libctx_test.c b/test/evp_libctx_test.c index 039fca9bb09..ae03e589a03 100644 --- a/test/evp_libctx_test.c +++ b/test/evp_libctx_test.c @@ -487,7 +487,8 @@ static int test_cipher_reinit_partialupdate(int test_id) /* skip any ciphers that don't allow partial updates */ if (((EVP_CIPHER_get_flags(cipher) - & (EVP_CIPH_FLAG_CTS | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK)) != 0) + & (EVP_CIPH_FLAG_CTS | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK | + EVP_CIPH_FLAG_ENC_THEN_MAC)) != 0) || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_CCM_MODE || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_XTS_MODE || EVP_CIPHER_get_mode(cipher) == EVP_CIPH_WRAP_MODE) { diff --git a/test/evp_test.c b/test/evp_test.c index e34ea1d96e6..f45b7d81662 100644 --- a/test/evp_test.c +++ b/test/evp_test.c @@ -918,7 +918,8 @@ static int cipher_test_valid_fragmentation(CIPHER_DATA *cdat) || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_SIV_MODE || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_GCM_SIV_MODE || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_XTS_MODE - || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_WRAP_MODE) ? 0 : 1; + || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_WRAP_MODE + || EVP_CIPHER_get_mode(cdat->cipher) == EVP_CIPH_CBC_MODE) ? 
0 : 1; } static int cipher_test_init(EVP_TEST *t, const char *alg) @@ -1025,6 +1026,10 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, cdat->key_bits = (size_t)i; return 1; } + if (strcmp(keyword, "Tag") == 0) + return parse_bin(value, &cdat->tag, &cdat->tag_len); + if (strcmp(keyword, "MACKey") == 0) + return parse_bin(value, &cdat->mac_key, &cdat->mac_key_len); if (cdat->aead) { int tls_aad = 0; @@ -1037,8 +1042,6 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, } return -1; } - if (strcmp(keyword, "Tag") == 0) - return parse_bin(value, &cdat->tag, &cdat->tag_len); if (strcmp(keyword, "SetTagLate") == 0) { if (strcmp(value, "TRUE") == 0) cdat->tag_late = 1; @@ -1048,8 +1051,6 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, return -1; return 1; } - if (strcmp(keyword, "MACKey") == 0) - return parse_bin(value, &cdat->mac_key, &cdat->mac_key_len); if (strcmp(keyword, "TLSVersion") == 0) { char *endptr; @@ -1349,6 +1350,12 @@ static int cipher_test_enc(EVP_TEST *t, int enc, size_t out_misalign, t->err = "TAG_SET_ERROR"; goto err; } + } else if (!enc && expected->mac_key && expected->tag) { + if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, + expected->tag_len, expected->tag) <= 0) { + t->err = "TAG_SET_ERROR"; + goto err; + } } if (expected->xts_standard != NULL) { OSSL_PARAM params[2]; @@ -1443,6 +1450,48 @@ static int cipher_test_enc(EVP_TEST *t, int enc, size_t out_misalign, rtag, expected->tag_len)) goto err; } + if (enc && expected->tag) { + if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA1-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA1-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")) { + unsigned char rtag[32] = {0}; + unsigned tag_len = 0; + OSSL_PARAM params[2]; + + if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA1-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA1-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA1-ETM")) { + tag_len = 20; + } else if (EVP_CIPHER_is_a(expected->cipher, "AES-128-CBC-HMAC-SHA256-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-192-CBC-HMAC-SHA256-ETM") + || EVP_CIPHER_is_a(expected->cipher, "AES-256-CBC-HMAC-SHA256-ETM")) { + tag_len = 32; + } + + if (!TEST_size_t_le(expected->tag_len, tag_len)) { + t->err = "TAG_LENGTH_INTERNAL_ERROR"; + goto err; + } + + params[0] = OSSL_PARAM_construct_octet_string(OSSL_CIPHER_HMAC_PARAM_MAC, + &rtag[0], + tag_len); + params[1] = OSSL_PARAM_construct_end(); + + if (!EVP_CIPHER_CTX_get_params(ctx, params)) { + t->err = "TAG_RETRIEVE_ERROR"; + goto err; + } + + if (!memory_err_compare(t, "TAG_VALUE_MISMATCH", + expected->tag, expected->tag_len, + rtag, expected->tag_len)) + goto err; + } + } /* Check the updated IV */ if (expected->next_iv != NULL) { /* Some (e.g., GCM) tests use IVs longer than EVP_MAX_IV_LENGTH. 
*/ diff --git a/test/recipes/30-test_evp_data/evpciph_aes_stitched.txt b/test/recipes/30-test_evp_data/evpciph_aes_stitched.txt index ef2d1a27f98..06da481bb5b 100644 --- a/test/recipes/30-test_evp_data/evpciph_aes_stitched.txt +++ b/test/recipes/30-test_evp_data/evpciph_aes_stitched.txt @@ -124,3 +124,108 @@ TLSVersion = 0x0302 Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a96221b924d9d025ef5e7150d4ca76921a025dd146fef87da738877313f11ec8f4c558b878c28ce6a9a5011d70f58c5dbd3412cf0a32154f5a4286958a5a50a86f15119835ceccf432601e4cc688cdd682ac9620500b60c0760bb93209859823778a7f2b5bab1af259bda13d84f952af9d2f07f500dadedc41a2b6a737a1296e0b2fb96ac4da4bf71fe2f0c4a1b6fc4dd251087e4c03d2e28c85a9b4a835ef166b48e5b7690f332a1d8db7bd9380221891f31ee82f4b8dd9ebf540cab583a0f33 NextIV = 1f31ee82f4b8dd9ebf540cab583a0f33 + +Title = AES-128-CBC-HMAC-SHA1-ETM test vectors + +Cipher = AES-128-CBC-HMAC-SHA1-ETM +Key = feffe9928665731c6d6a8f9467308308 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f +Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be +Tag = 23f889888b834208235ad034ec087674f5d80a4a + +Cipher = AES-128-CBC-HMAC-SHA1-ETM +Key = feffe9928665731c6d6a8f9467308308 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c +Tag = 1b001f67b5438782bffb7febbca4ef4cca9f56ab + +Cipher = AES-128-CBC-HMAC-SHA1-ETM +Key = feffe9928665731c6d6a8f9467308308 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 
000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c3bfadb50a6126c4fe31d52606b97f347a9d6722a458cc2afdd895c3a247d11e551398180bc445b0ea94d17a1a441fb10b86d84a7549e03b6edf1a12591c63dfa167f2f11ea12b2d3d8f62d92be9238d1e6eed2099f3d0f9e1fe541618bbda588899002c3078202a2d138942c4325b673e494b310a502cda70e8f62480776c31068cb3d2f4c250b9e65669d950b1a4d50cf5f2b11c74960347885e8dbb89d58f24871c34f1a134b1873222b24a310f8bb3299ca1d16cb1921c97fb462e3150b57909ec7d376e93e52ea9e51094f22f11273c32403c82acebf575b7b7af7c98976adf6f4bd4199bd9201fa7321aaad828bfcc3785776f959484ff013d8a66d579af036a6c0e82d94e6eb773f6124f18da5ca4cf5b70f72e9d852766af78269d36a03eb2e2cdda79f16c0f81be27b6593c3f4e9d19cb7018a7e4ca74756dd66ac1b45a4d741e0431d120a7f84dbbc4d7d478b54464050e62d8da0c856ccbc2dcd4dec4aa4d554ac4cce8fbeca8ba4efb55a25771f425a6e5bd74c35972c3da41eeee7fb36b5075e5ab3115f7424f0dab05a085185e923d9ad3e74dc16ff2ecfe03afdf34ba17babafc65aa87600c632ccdcbcc1b591d723eb37a8a3f869cce9fe41 +Tag = 2824154e89a5867c46eebafb04ae3e3e4938f8bb + +Title = AES-256-CBC-HMAC-SHA1-ETM test vectors + +Cipher = AES-256-CBC-HMAC-SHA1-ETM +Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f +Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0 +Tag = f7bce8a7f8a307b736f6fabb194fc29ceed3e0df + +Cipher = AES-256-CBC-HMAC-SHA1-ETM +Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a +Tag = b86074c76249f1f058674c514dd52225c9bee36e + +Cipher = AES-256-CBC-HMAC-SHA1-ETM +Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5 +MACKey = 
cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a4ca75016ae77ac335ba758396232a87ffceacf24a0e287371eaa04570cb68dcd61882e1c3f7aca38afed34138fedefe167bb9c741ebd14da2eba3cf5b9aa06bb93ca61fa462de7e1f439efac5ea55edab61171250be36da513e6b5f92c8267f778cdde5720128a586c7bbd5864686b12710daa9f133706e81fa3a066bd1f29277c08ca8f052b3ed06f04ec2a8509f54934fd9b06f4115e546011ff485ac76d5fce0329c94bf5f29726bed49ace94abf53b036c1f920f8c71d44deca7b11f653025698425717bb3cc8f5e74230d8ede675ee0eae6f8aae274152c7503c567427a71323feb84b0fc0515030c933e4c7399be13322b5d4ccabb97c011d75de82f38a540e972bc2a515dc31d50e78b74be891cc4a2ddbe4b50d0d27c069985a581b80a9f591a4bb198f085af2138ca9b4f595c37d60f15d960b1e39de7ff92a699d9aca4a44ff9d327c7130e6b0ce90032e358f3743d8abccaeb0426226d6ec233fdf289bdde5f3b2756a587a382e3353d77acb9774bd64978629633f2122d1fa376b12cfbe4781d6a35227d71fdfa929c1435596fbaf7fe0aea4fa02c6b9e8099c62149ed82819a2088b72660be8ea364c13d5340be93cab8ac92914d2b1115cbb7 +Tag = 8824b6ed9be82651706c292047c08269fd6c943b + +Title = AES-128-CBC-HMAC-SHA256-ETM test vectors + +Cipher = AES-128-CBC-HMAC-SHA256-ETM +Key = feffe9928665731c6d6a8f9467308308 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f +Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be +Tag = 3f1dd3b858ecc9d8beea6db830a1fe6f362b48909974d44fa0c9ef7d22e515e7 + +Cipher = AES-128-CBC-HMAC-SHA256-ETM +Key = feffe9928665731c6d6a8f9467308308 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c +Tag = 4cd35de98355fda7334262a3d9eb26d0cd4c4d9b2c58b5a107bd8728da18e6bb + +Cipher = AES-128-CBC-HMAC-SHA256-ETM +Key = 
feffe9928665731c6d6a8f9467308308 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 18bd54842828fdc0ac5a3b459f32f0be305a77944b17f62fedd4442ae60a0b0a3e1c2c23c584c86877fbd9997b415959254ea06ef046dc2e1fdafe7950a77ba94494683e01a0c495dc223a2de73be1474bcdf0b104a89ca6d419254e8f602334158d188f748c5cf4b7473c7475b4cf6c3bfadb50a6126c4fe31d52606b97f347a9d6722a458cc2afdd895c3a247d11e551398180bc445b0ea94d17a1a441fb10b86d84a7549e03b6edf1a12591c63dfa167f2f11ea12b2d3d8f62d92be9238d1e6eed2099f3d0f9e1fe541618bbda588899002c3078202a2d138942c4325b673e494b310a502cda70e8f62480776c31068cb3d2f4c250b9e65669d950b1a4d50cf5f2b11c74960347885e8dbb89d58f24871c34f1a134b1873222b24a310f8bb3299ca1d16cb1921c97fb462e3150b57909ec7d376e93e52ea9e51094f22f11273c32403c82acebf575b7b7af7c98976adf6f4bd4199bd9201fa7321aaad828bfcc3785776f959484ff013d8a66d579af036a6c0e82d94e6eb773f6124f18da5ca4cf5b70f72e9d852766af78269d36a03eb2e2cdda79f16c0f81be27b6593c3f4e9d19cb7018a7e4ca74756dd66ac1b45a4d741e0431d120a7f84dbbc4d7d478b54464050e62d8da0c856ccbc2dcd4dec4aa4d554ac4cce8fbeca8ba4efb55a25771f425a6e5bd74c35972c3da41eeee7fb36b5075e5ab3115f7424f0dab05a085185e923d9ad3e74dc16ff2ecfe03afdf34ba17babafc65aa87600c632ccdcbcc1b591d723eb37a8a3f869cce9fe41 +Tag = e23b2b23e56698fcbe2bde48035a863bb73a58e9e12d7e0de2fda0f82ff87676 + +Title = AES-256-CBC-HMAC-SHA256-ETM test vectors + +Cipher = AES-256-CBC-HMAC-SHA256-ETM +Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f +Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0 +Tag = 38bc6d7930f516a29b17ede8388d42faa612b163021028b0d86b08c3b87cd31e + +Cipher = AES-256-CBC-HMAC-SHA256-ETM +Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a +Tag = 
b3e7c334cf7906f5dfe8e20bb6a332578dd8e7fb688a7dcd299c1caba3fefbe5 + +Cipher = AES-256-CBC-HMAC-SHA256-ETM +Key = 6cc028952fa7c1ee09fc78b7549ae04d79b54d40ec172333e3a4a2297b62afe5 +MACKey = cafebabefacedbaddecaf88801020304 +IV = 101112131415161718191a1b1c1d1e1f +Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f +Ciphertext = 261cd0c88a4d4e6db7fc263257a9f6d0ce83c1ff5f2680dc57ffd8eefdbb9c00d3d507672d105a990b2b78509978625b9d93c2bd41e3fb721abd1496553c583c67dad9b662b3d58c8540e10ed9c5ed1a7f33ce9e9a41c30836651d73ee2c003af03a919eb41a6d70ef814e184e740f8a4ca75016ae77ac335ba758396232a87ffceacf24a0e287371eaa04570cb68dcd61882e1c3f7aca38afed34138fedefe167bb9c741ebd14da2eba3cf5b9aa06bb93ca61fa462de7e1f439efac5ea55edab61171250be36da513e6b5f92c8267f778cdde5720128a586c7bbd5864686b12710daa9f133706e81fa3a066bd1f29277c08ca8f052b3ed06f04ec2a8509f54934fd9b06f4115e546011ff485ac76d5fce0329c94bf5f29726bed49ace94abf53b036c1f920f8c71d44deca7b11f653025698425717bb3cc8f5e74230d8ede675ee0eae6f8aae274152c7503c567427a71323feb84b0fc0515030c933e4c7399be13322b5d4ccabb97c011d75de82f38a540e972bc2a515dc31d50e78b74be891cc4a2ddbe4b50d0d27c069985a581b80a9f591a4bb198f085af2138ca9b4f595c37d60f15d960b1e39de7ff92a699d9aca4a44ff9d327c7130e6b0ce90032e358f3743d8abccaeb0426226d6ec233fdf289bdde5f3b2756a587a382e3353d77acb9774bd64978629633f2122d1fa376b12cfbe4781d6a35227d71fdfa929c1435596fbaf7fe0aea4fa02c6b9e8099c62149ed82819a2088b72660be8ea364c13d5340be93cab8ac92914d2b1115cbb7 +Tag = 8cb8898a5b559984da3cbaa4703c9ed3cfc2f56c7292a3279a3dd5f7475412e1 + diff --git a/util/perl/OpenSSL/paramnames.pm b/util/perl/OpenSSL/paramnames.pm index 059b4897356..f9acf037640 100644 --- a/util/perl/OpenSSL/paramnames.pm +++ b/util/perl/OpenSSL/paramnames.pm @@ -150,6 +150,8 @@ my %params = ( 'CIPHER_PARAM_ALGORITHM_ID_PARAMS' => '*ALG_PARAM_ALGORITHM_ID_PARAMS', 'CIPHER_PARAM_ALGORITHM_ID_PARAMS_OLD' => "alg_id_param", # octet_string 'CIPHER_PARAM_XTS_STANDARD' => "xts_standard",# utf8_string + 'CIPHER_PARAM_ENCRYPT_THEN_MAC' => "encrypt-then-mac",# int, 0 or 1 + 'CIPHER_HMAC_PARAM_MAC' => "*CIPHER_PARAM_AEAD_TAG", 'CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT' => "tls1multi_maxsndfrag",# uint 'CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_BUFSIZE' => "tls1multi_maxbufsz", # size_t -- 2.47.2
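
Usage sketch (illustrative, not part of the patch): the test/evp_test.c changes above show the intended calling pattern for the new ciphers. The HMAC key is supplied at init time through the OSSL_CIPHER_PARAM_AEAD_MAC_KEY ("mackey") octet-string parameter, and after encryption the tag is read back under the "tag" parameter name, which the new OSSL_CIPHER_HMAC_PARAM_MAC aliases via CIPHER_PARAM_AEAD_TAG. The fragment below follows that pattern for AES-256-CBC-HMAC-SHA256-ETM; the function name and the zeroed key/IV values are purely illustrative, error reporting is reduced to a 0/1 result, and since only non-padding mode is supported the input length must be a multiple of the 16-byte AES block.

#include <openssl/evp.h>
#include <openssl/core_names.h>

/* Encrypt inlen bytes (a multiple of 16) and fetch the 32-byte HMAC tag. */
static int etm_encrypt_sketch(const unsigned char key[32],
                              const unsigned char mac_key[16],
                              const unsigned char iv[16],
                              const unsigned char *in, size_t inlen,
                              unsigned char *out, unsigned char tag[32])
{
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "AES-256-CBC-HMAC-SHA256-ETM",
                                          NULL);
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    OSSL_PARAM params[2];
    int outl, ok = 0;

    if (cipher == NULL || ctx == NULL)
        goto end;

    /* Hand the HMAC key to the provider ("mackey" octet string). */
    params[0] = OSSL_PARAM_construct_octet_string(OSSL_CIPHER_PARAM_AEAD_MAC_KEY,
                                                  (void *)mac_key, 16);
    params[1] = OSSL_PARAM_construct_end();
    if (!EVP_EncryptInit_ex2(ctx, cipher, key, iv, params))
        goto end;

    /* One call does the interleaved AES-CBC + HMAC over the ciphertext. */
    if (!EVP_EncryptUpdate(ctx, out, &outl, in, (int)inlen))
        goto end;

    /* Retrieve the computed HMAC-SHA256 tag under the "tag" name. */
    params[0] = OSSL_PARAM_construct_octet_string("tag", tag, 32);
    params[1] = OSSL_PARAM_construct_end();
    if (!EVP_CIPHER_CTX_get_params(ctx, params))
        goto end;
    ok = 1;
 end:
    EVP_CIPHER_CTX_free(ctx);
    EVP_CIPHER_free(cipher);
    return ok;
}

For decryption the expected tag is supplied before the data is processed, exactly as the test harness does with EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, taglen, tag); the provider then recomputes the MAC over the ciphertext and rejects the operation on a CRYPTO_memcmp() mismatch.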