From: Stanciu, Adrian Date: Fri, 30 May 2025 16:17:26 +0000 (+0300) Subject: Add AES-CFB128 optimizations with Intel AVX-512 and VAES X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=055dd1d8bb24ba307981091524fdf06da3771641;p=thirdparty%2Fopenssl.git Add AES-CFB128 optimizations with Intel AVX-512 and VAES Reviewed-by: Neil Horman Reviewed-by: Shane Lontis (Merged from https://github.com/openssl/openssl/pull/26902) --- diff --git a/CHANGES.md b/CHANGES.md index 67f83c5528c..cf0aaa1a0c2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -127,6 +127,12 @@ OpenSSL 3.6 *Daniel Van Geest (CryptoNext Security)* + * Added Intel AVX-512 and VAES optimizations for AES-CFB128 algorithms. + Encryption performance on large buffers improved by 1.5-1.7x, + while decryption speed increased by 20-23x. + + *Adrian Stanciu* + OpenSSL 3.5 ----------- diff --git a/crypto/aes/asm/aes-cfb-avx512.pl b/crypto/aes/asm/aes-cfb-avx512.pl new file mode 100644 index 00000000000..8136f16e55b --- /dev/null +++ b/crypto/aes/asm/aes-cfb-avx512.pl @@ -0,0 +1,1033 @@ +#! /usr/bin/env perl +# Copyright 2025 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2025, Intel Corporation. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Implements AES-CFB128 encryption and decryption with Intel(R) VAES + +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$avx512vaes=0; # will be non-zero if tooling supports Intel AVX-512 and VAES + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx512vaes = ($1>=2.30); +} + +if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx512vaes = ($1==2.13 && $2>=3) + ($1>=2.14); +} + +if (!$avx512vaes && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+\.[0-9]+)\./) { + $avx512vaes = ($1>=14.16); +} + +if (!$avx512vaes && `$ENV{CC} -v 2>&1` + =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) { + my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 + if ($1) { + # Apple conditions, they use a different version series, see + # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2 + # clang 7.0.0 is Apple clang 10.0.1 + $avx512vaes = ($ver>=10.0001) + } else { + $avx512vaes = ($ver>=7.0); + } +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +################################################################## + +$code=".text\n"; + +if ($avx512vaes) { + +$code.=<<___; +.extern OPENSSL_ia32cap_P + +################################################################# +# Signature: +# +# int ossl_aes_cfb128_vaes_eligible(void); +# +# Detects if the underlying hardware supports all the features +# required to run the Intel AVX-512 implementations of 
AES-CFB128 algorithms.
+#
+# Returns: non-zero if all the required features are detected, 0 otherwise
+#################################################################
+
+.globl ossl_aes_cfb128_vaes_eligible
+.type ossl_aes_cfb128_vaes_eligible,\@abi-omnipotent
+.balign 64
+
+ossl_aes_cfb128_vaes_eligible:
+.cfi_startproc
+ endbranch
+
+ mov OPENSSL_ia32cap_P+8(%rip),%ecx
+ xor %eax,%eax
+
+ # Check 3rd 32-bit word of OPENSSL_ia32cap_P for the feature bit(s):
+ # AVX512BW (bit 30) + AVX512DQ (bit 17) + AVX512F (bit 16)
+
+ and \$0x40030000,%ecx # mask is 1<<30|1<<17|1<<16
+ cmp \$0x40030000,%ecx
+ jne .Laes_cfb128_vaes_eligible_done
+
+ mov OPENSSL_ia32cap_P+12(%rip),%ecx
+
+ # Check 4th 32-bit word of OPENSSL_ia32cap_P for the feature bit(s):
+ # AVX512VAES (bit 9)
+
+ and \$0x200,%ecx # mask is 1<<9
+ cmp \$0x200,%ecx
+ cmove %ecx,%eax
+
+.Laes_cfb128_vaes_eligible_done:
+ ret
+.cfi_endproc
+.size ossl_aes_cfb128_vaes_eligible, .-ossl_aes_cfb128_vaes_eligible
+___
+
+#################################################################
+#
+# AES subroutines for:
+# - preloading the AES key schedule into AVX registers
+# - single-block AES encryption used by CFB encryption and decryption
+# - multiple-block AES encryption used by CFB decryption
+#
+# The CFB mode only uses block cipher encryption.
+#
+# The AES encryption step is described in Section 5.1 Cipher() of
+# FIPS 197 https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197-upd1.pdf
+# and implemented with Intel(R) AES-NI and VAES instructions:
+#
+# - AESKEYGENASSIST for key expansion (done elsewhere, in aesni_set_encrypt_key())
+# - VPXORD for AES pre-whitening
+# - VAESENC for performing one AES encryption round
+# - VAESENCLAST for performing the last AES encryption round
+#
+# For more information please consult:
+# - the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual,
+#   Chapter 21: Cryptography & Finite Field Arithmetic Instructions
+#   https://www.intel.com/content/www/us/en/developer/articles/technical/intel64-and-ia32-architectures-optimization.html
+# - the Intel(R) Advanced Encryption Standard (AES) New Instructions Set Whitepaper
+#   https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf
+#
+#################################################################
+
+# expects the key schedule address in $key_original
+sub load_aes_key_schedule_1x() {
+$code.=<<___;
+ vmovdqu8 0($key_original),%xmm17 # schedule 0 whitening
+ vmovdqu8 16($key_original),%xmm18 # 1
+ vmovdqu8 32($key_original),%xmm19 # 2
+ vmovdqu8 48($key_original),%xmm20 # 3
+ vmovdqu8 64($key_original),%xmm21 # 4
+ vmovdqu8 80($key_original),%xmm22 # 5
+ vmovdqu8 96($key_original),%xmm23 # 6
+ vmovdqu8 112($key_original),%xmm24 # 7
+ vmovdqu8 128($key_original),%xmm25 # 8
+ vmovdqu8 144($key_original),%xmm26 # 9
+ vmovdqu8 160($key_original),%xmm27 # 10 last for AES-128
+ vmovdqu8 176($key_original),%xmm28 # 11
+ vmovdqu8 192($key_original),%xmm29 # 12 last for AES-192
+ vmovdqu8 208($key_original),%xmm30 # 13
+ vmovdqu8 224($key_original),%xmm31 # 14 last for AES-256
+
+ mov 240($key_original),$rounds # load AES rounds
+ # 240 is the byte-offset of the rounds field in AES_KEY
+___
+}
+
+
+# expects the key schedule address in $key_original
+sub load_aes_key_schedule_4x() {
+$code.=<<___;
+ vbroadcasti32x4 0($key_original),%zmm17 # schedule 0 whitening
+ vbroadcasti32x4 16($key_original),%zmm18 # 1
+ vbroadcasti32x4 32($key_original),%zmm19 # 2
+ vbroadcasti32x4 48($key_original),%zmm20 # 3
+
vbroadcasti32x4 64($key_original),%zmm21 # 4 + vbroadcasti32x4 80($key_original),%zmm22 # 5 + vbroadcasti32x4 96($key_original),%zmm23 # 6 + vbroadcasti32x4 112($key_original),%zmm24 # 7 + vbroadcasti32x4 128($key_original),%zmm25 # 8 + vbroadcasti32x4 144($key_original),%zmm26 # 9 + vbroadcasti32x4 160($key_original),%zmm27 # 10 last for AES-128 + vbroadcasti32x4 176($key_original),%zmm28 # 11 + vbroadcasti32x4 192($key_original),%zmm29 # 12 last for AES-192 + vbroadcasti32x4 208($key_original),%zmm30 # 13 + vbroadcasti32x4 224($key_original),%zmm31 # 14 last for AES-256 + + mov 240($key_original),$rounds # load AES rounds + # 240 is the byte-offset of the rounds field in AES_KEY +___ +} + +# Performs AES encryption of 1 128-bit block +# Expects iv in $temp, non-final AES rounds in $rounds and key schedule in xmm17..31 +sub vaes_encrypt_block_1x() { + my ($label_prefix)=@_; +$code.=<<___; + vpxord %xmm17,$temp,$temp # AES pre-whitening + vaesenc %xmm18,$temp,$temp + vaesenc %xmm19,$temp,$temp + vaesenc %xmm20,$temp,$temp + vaesenc %xmm21,$temp,$temp + vaesenc %xmm22,$temp,$temp + vaesenc %xmm23,$temp,$temp + vaesenc %xmm24,$temp,$temp + vaesenc %xmm25,$temp,$temp + vaesenc %xmm26,$temp,$temp + + cmp \$0x09,$rounds + ja ${label_prefix}_192_256 + + vaesenclast %xmm27,$temp,$temp # last AES-128 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_192_256: + + vaesenc %xmm27,$temp,$temp + vaesenc %xmm28,$temp,$temp + + cmp \$0x0B,$rounds + ja ${label_prefix}_256 + + vaesenclast %xmm29,$temp,$temp # last AES-192 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_256: + + vaesenc %xmm29,$temp,$temp + vaesenc %xmm30,$temp,$temp + vaesenclast %xmm31,$temp,$temp # last AES-256 encryption round + +.balign 32 +${label_prefix}_end: +___ +} + + +# Performs parallel AES encryption of 4 128-bit blocks +# Expects iv in $temp_4x, non-final AES rounds in $rounds and key schedule in zmm17..31 +sub vaes_encrypt_block_4x() { + my ($label_prefix)=@_; +$code.=<<___; + vpxord %zmm17,$temp_4x,$temp_4x # AES pre-whitening + vaesenc %zmm18,$temp_4x,$temp_4x + vaesenc %zmm19,$temp_4x,$temp_4x + vaesenc %zmm20,$temp_4x,$temp_4x + vaesenc %zmm21,$temp_4x,$temp_4x + vaesenc %zmm22,$temp_4x,$temp_4x + vaesenc %zmm23,$temp_4x,$temp_4x + vaesenc %zmm24,$temp_4x,$temp_4x + vaesenc %zmm25,$temp_4x,$temp_4x + vaesenc %zmm26,$temp_4x,$temp_4x + + cmp \$0x09,$rounds + ja ${label_prefix}_192_256 + + vaesenclast %zmm27,$temp_4x,$temp_4x # last AES-128 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_192_256: + + vaesenc %zmm27,$temp_4x,$temp_4x + vaesenc %zmm28,$temp_4x,$temp_4x + + cmp \$0x0B,$rounds + ja ${label_prefix}_256 + + vaesenclast %zmm29,$temp_4x,$temp_4x # last AES-192 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_256: + + vaesenc %zmm29,$temp_4x,$temp_4x + vaesenc %zmm30,$temp_4x,$temp_4x + vaesenclast %zmm31,$temp_4x,$temp_4x # last AES-256 encryption round + +.balign 32 +${label_prefix}_end: +___ +} + + +# Performs parallel AES encryption of 16 128-bit blocks +# Expects input in $temp_*x, non-final AES rounds in $rounds and key schedule in zmm17..31 +sub vaes_encrypt_block_16x() { + my ($label_prefix)=@_; +$code.=<<___; + vpxord %zmm17,$temp_4x, $temp_4x # parallel AES pre-whitening + vpxord %zmm17,$temp_8x, $temp_8x + vpxord %zmm17,$temp_12x,$temp_12x + vpxord %zmm17,$temp_16x,$temp_16x + + vaesenc %zmm18,$temp_4x, $temp_4x + vaesenc %zmm18,$temp_8x, $temp_8x + vaesenc %zmm18,$temp_12x,$temp_12x + vaesenc 
%zmm18,$temp_16x,$temp_16x + + vaesenc %zmm19,$temp_4x, $temp_4x + vaesenc %zmm19,$temp_8x, $temp_8x + vaesenc %zmm19,$temp_12x,$temp_12x + vaesenc %zmm19,$temp_16x,$temp_16x + + vaesenc %zmm20,$temp_4x, $temp_4x + vaesenc %zmm20,$temp_8x, $temp_8x + vaesenc %zmm20,$temp_12x,$temp_12x + vaesenc %zmm20,$temp_16x,$temp_16x + + vaesenc %zmm21,$temp_4x, $temp_4x + vaesenc %zmm21,$temp_8x, $temp_8x + vaesenc %zmm21,$temp_12x,$temp_12x + vaesenc %zmm21,$temp_16x,$temp_16x + + vaesenc %zmm22,$temp_4x, $temp_4x + vaesenc %zmm22,$temp_8x, $temp_8x + vaesenc %zmm22,$temp_12x,$temp_12x + vaesenc %zmm22,$temp_16x,$temp_16x + + vaesenc %zmm23,$temp_4x, $temp_4x + vaesenc %zmm23,$temp_8x, $temp_8x + vaesenc %zmm23,$temp_12x,$temp_12x + vaesenc %zmm23,$temp_16x,$temp_16x + + vaesenc %zmm24,$temp_4x, $temp_4x + vaesenc %zmm24,$temp_8x, $temp_8x + vaesenc %zmm24,$temp_12x,$temp_12x + vaesenc %zmm24,$temp_16x,$temp_16x + + vaesenc %zmm25,$temp_4x, $temp_4x + vaesenc %zmm25,$temp_8x, $temp_8x + vaesenc %zmm25,$temp_12x,$temp_12x + vaesenc %zmm25,$temp_16x,$temp_16x + + vaesenc %zmm26,$temp_4x, $temp_4x + vaesenc %zmm26,$temp_8x, $temp_8x + vaesenc %zmm26,$temp_12x,$temp_12x + vaesenc %zmm26,$temp_16x,$temp_16x + + cmp \$0x09,$rounds + ja ${label_prefix}_192_256 + + vaesenclast %zmm27,$temp_4x, $temp_4x # last AES-128 encryption round + vaesenclast %zmm27,$temp_8x, $temp_8x + vaesenclast %zmm27,$temp_12x,$temp_12x + vaesenclast %zmm27,$temp_16x,$temp_16x + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_192_256: + + vaesenc %zmm27,$temp_4x, $temp_4x + vaesenc %zmm27,$temp_8x, $temp_8x + vaesenc %zmm27,$temp_12x,$temp_12x + vaesenc %zmm27,$temp_16x,$temp_16x + + vaesenc %zmm28,$temp_4x, $temp_4x + vaesenc %zmm28,$temp_8x, $temp_8x + vaesenc %zmm28,$temp_12x,$temp_12x + vaesenc %zmm28,$temp_16x,$temp_16x + + cmp \$0x0B,$rounds + ja ${label_prefix}_256 + + vaesenclast %zmm29,$temp_4x, $temp_4x # last AES-192 encryption round + vaesenclast %zmm29,$temp_8x, $temp_8x + vaesenclast %zmm29,$temp_12x,$temp_12x + vaesenclast %zmm29,$temp_16x,$temp_16x + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_256: + + vaesenc %zmm29,$temp_4x, $temp_4x + vaesenc %zmm29,$temp_8x, $temp_8x + vaesenc %zmm29,$temp_12x,$temp_12x + vaesenc %zmm29,$temp_16x,$temp_16x + + vaesenc %zmm30,$temp_4x, $temp_4x + vaesenc %zmm30,$temp_8x, $temp_8x + vaesenc %zmm30,$temp_12x,$temp_12x + vaesenc %zmm30,$temp_16x,$temp_16x + + vaesenclast %zmm31,$temp_4x, $temp_4x # last AES-256 encryption round + vaesenclast %zmm31,$temp_8x, $temp_8x + vaesenclast %zmm31,$temp_12x,$temp_12x + vaesenclast %zmm31,$temp_16x,$temp_16x + +.balign 32 +${label_prefix}_end: +___ +} + +################################################################# +# Signature: +# +# void ossl_aes_cfb128_vaes_enc( +# const unsigned char *in, +# unsigned char *out, +# size_t len, +# const AES_KEY *ks, +# const unsigned char ivec[16], +# /*in-out*/ ossl_ssize_t *num); +# +# Preconditions: +# - all pointers are valid (not NULL...) 
+# - AES key schedule and rounds in `ks` are precomputed
+#
+# Invariants:
+# - `*num` is between 0 and 15 (inclusive)
+#
+#################################################################
+#
+# The implementation closely follows the encryption half of CRYPTO_cfb128_encrypt:
+# - "pre" step: processes the last bytes of a partial block
+# - "mid" step: processes complete blocks
+# - "post" step: processes the first bytes of a partial block
+#
+# To obtain the next ciphertext block `cipher <n>` from
+# the plaintext block `plain <n>`, the previous ciphertext
+# block `cipher <n-1>` is required as input.
+#
+# The dependency on previous encryption outputs (ciphertexts)
+# makes CFB encryption inherently serial.
+#
+#               +----+                            +----------+
+#               | iv |         +------------------> cipher 0 |
+#               +--+-+         |                  +----------+
+#                  |           |                        |
+#                  |           |                        |
+#           +------v------+    |                 +------v------+
+#           | AES encrypt |    |                 | AES encrypt |
+#           | with key    |    |                 | with key    |
+#           +------+------+    |                 +------+------+
+#                  |           |                        |
+#                  |           |                        |
+# +---------+   +--v--+        |       +---------+   +--v--+
+# | plain 0 +---> XOR |        |       | plain 1 +---> XOR |
+# +---------+   +--+--+        |       +---------+   +--+--+
+#                  |           |                        |
+#                  |           |                        |
+#            +-----v----+      |                  +-----v----+
+#            | cipher 0 +------+                  | cipher 1 |
+#            +----------+                         +----------+
+#
+#################################################################
+
+$code.=<<___;
+.globl ossl_aes_cfb128_vaes_enc
+.type ossl_aes_cfb128_vaes_enc,\@function,6
+.balign 64
+ossl_aes_cfb128_vaes_enc:
+.cfi_startproc
+ endbranch
+___
+
+$inp="%rdi"; # arg0
+$out="%rsi"; # arg1
+$len="%rdx"; # arg2
+
+$key_original="%rcx"; # arg3
+$key_backup="%r10";
+
+$ivp="%r8"; # arg4
+$nump="%r9"; # arg5
+
+$num="%r11";
+$left="%rcx";
+$mask="%rax";
+
+$rounds="%r11d";
+
+$temp="%xmm2";
+$plain="%xmm3";
+
+$code.=<<___;
+
+ mov ($nump),$num # $num is the current byte index in the first partial block
+ # $num belongs to 0..15; non-zero means a partial first block
+
+ test $len,$len # return early if $len==0, unlikely to occur
+ jz .Laes_cfb128_vaes_enc_done
+
+ test $num,$num # check if the first block is partial
+ jz .Laes_cfb128_enc_mid # if not, jump to processing full blocks
+
+###########################################################
+# first partial block pre-processing
+###########################################################
+
+ mov $key_original,$key_backup # make room for variable shl with cl
+
+ mov \$0x10,$left # first block is partial
+ sub $num,$left # calculate how many bytes $left to process in the block
+ cmp $len,$left #
+ cmova $len,$left # $left = min(16-$num,$len)
+
+ mov \$1,$mask # build a mask with the least significant $left bits set
+ shlq %cl,$mask # $left is left shift counter
+ dec $mask # $mask is 2^$left-1
+ kmovq $mask,%k1
+
+ mov $num,%rax # keep in-out $num in %al
+ add $left,%rax # advance $num
+ and \$0x0F,%al # wrap-around $num in a 16-byte block
+
+ leaq ($num,$ivp),%r11 # process $left iv bytes
+ vmovdqu8 (%r11),%xmm0
+ vmovdqu8 ($inp),%xmm1 # process $left input bytes
+ vpxor %xmm0,%xmm1,%xmm2 # CipherFeedBack XOR
+ vmovdqu8 %xmm2,($out){%k1} # write $left output bytes
+ vmovdqu8 %xmm2,(%r11){%k1} # blend $left output bytes into iv
+
+ add $left,$inp # advance pointers
+ add $left,$out
+ sub $left,$len
+ jz .Laes_cfb128_enc_zero_pre # return early if no AES encryption required
+
+ mov $key_backup,$key_original # restore "key_original" as arg3
+
+.Laes_cfb128_enc_mid:
+___
+
+ &load_aes_key_schedule_1x();
+
+$code.=<<___;
+###########################################################
+# inner full blocks processing
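+#
+# In C terms the loop below is, roughly (reference sketch only, not
+# generated code; AES_encrypt is the one-block primitive from
+# <openssl/aes.h>):
+#
+#     while (len >= 16) {
+#         AES_encrypt(iv, iv, key);          /* iv = E_k(C_{n-1})        */
+#         for (i = 0; i < 16; i++)
+#             out[i] = iv[i] ^= in[i];       /* C_n = P_n ^ E_k(C_{n-1}) */
+#         in += 16; out += 16; len -= 16;
+#     }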
+########################################################### + + vmovdqu ($ivp),$temp # load iv + + cmp \$0x10,$len # is there a full plaintext block left (128 bits) ? + jb .Laes_cfb128_enc_post + +.balign 32 +.Loop_aes_cfb128_enc_main: + sub \$0x10,$len + + vmovdqu ($inp),$plain # load plaintext block + lea 16($inp),$inp # $inp points to next plaintext +___ + + &vaes_encrypt_block_1x(".Laes_cfb128_enc_mid"); + +$code.=<<___; + + vpxor $plain,$temp,$temp # CipherFeedBack XOR + cmp \$0x10,$len + vmovdqu $temp,($out) # write ciphertext + lea 16($out),$out # $out points to the next output block + jae .Loop_aes_cfb128_enc_main + + xor %eax,%eax # reset num when processing full blocks + + vmovdqu $temp,($ivp) # latest ciphertext block is next encryption input + +.Laes_cfb128_enc_post: + +########################################################### +# last partial block post-processing +########################################################### + + test $len,$len # check if the last block is partial + jz .Laes_cfb128_enc_zero_all +___ + + &vaes_encrypt_block_1x(".Laes_cfb128_enc_post"); + +$code.=<<___; + + mov $len,%rax # num=$len + + mov \$1,%r11 # build a mask with the least significant $len bits set + mov %dl,%cl # $len is left shift counter less than 16 + shlq %cl,%r11 + dec %r11 # mask is 2^$len-1 + kmovq %r11,%k1 + + vmovdqu8 ($inp),%xmm1{%k1}{z} # read $len input bytes, zero the rest to not impact XOR + vpxor $temp,%xmm1,%xmm0 # CipherFeedBack XOR + vmovdqu8 %xmm0,($out){%k1} # write $len output bytes + vmovdqu8 %xmm0,($ivp) # write chained/streaming iv + + # clear registers + +.Laes_cfb128_enc_zero_all: + vpxord %xmm17,%xmm17,%xmm17 # clear the AES key schedule + vpxord %xmm18,%xmm18,%xmm18 + vpxord %xmm19,%xmm19,%xmm19 + vpxord %xmm20,%xmm20,%xmm20 + vpxord %xmm21,%xmm21,%xmm21 + vpxord %xmm22,%xmm22,%xmm22 + vpxord %xmm23,%xmm23,%xmm23 + vpxord %xmm24,%xmm24,%xmm24 + vpxord %xmm25,%xmm25,%xmm25 + vpxord %xmm26,%xmm26,%xmm26 + vpxord %xmm27,%xmm27,%xmm27 + vpxord %xmm28,%xmm28,%xmm28 + vpxord %xmm29,%xmm29,%xmm29 + vpxord %xmm30,%xmm30,%xmm30 + vpxord %xmm31,%xmm31,%xmm31 + + vpxor %xmm3,%xmm3,%xmm3 # clear registers used during AES encryption + +.Laes_cfb128_enc_zero_pre: + vpxor %xmm0,%xmm0,%xmm0 # clear the rest of the registers + vpxor %xmm1,%xmm1,%xmm1 + vpxor %xmm2,%xmm2,%xmm2 + + mov %rax,($nump) # num is in/out, update for future/chained calls + + vzeroupper + +.Laes_cfb128_vaes_enc_done: + ret +.cfi_endproc +.size ossl_aes_cfb128_vaes_enc,.-ossl_aes_cfb128_vaes_enc +___ + + +################################################################# +# Signature: +# +# void ossl_aes_cfb128_vaes_dec( +# const unsigned char *in, +# unsigned char *out, +# size_t len, +# const AES_KEY *ks, +# const unsigned char ivec[16], +# /*in-out*/ ossl_ssize_t *num); +# +# Preconditions: +# - all pointers are valid (not NULL...) 
+# - AES key schedule and rounds in `ks` are precomputed
+#
+# Invariants:
+# - `*num` is between 0 and 15 (inclusive)
+#
+#################################################################
+#
+# The implementation closely follows the decryption half of CRYPTO_cfb128_encrypt:
+#
+# - "pre" step: processes the last bytes of a partial block
+# - "mid" step: processes complete blocks using an unrolled approach:
+#   - processes 16 blocks in parallel until fewer than 16 blocks remain
+#   - processes 4 blocks in parallel until fewer than 4 blocks remain
+#   - processes 1 block in series until none are left
+# - "post" step: processes the first bytes of a partial block
+#
+# To obtain the next plaintext block `plain <n>` from
+# its ciphertext block `cipher <n>`, the previous ciphertext
+# block `cipher <n-1>` is required as input.
+#
+# Since CFB decryption for the current block depends only on
+# the iv and ciphertext blocks (already available as inputs)
+# and not on plaintext blocks, it can be efficiently parallelized.
+#
+#     +----+                        +----------+                    +----------+                    +----------+
+#     | iv |                        | cipher 0 |                    | cipher 1 |                    | cipher 2 |
+#     +--+-+                        +----+-----+                    +----+-----+                    +----+-----+
+#        |                               |                               |                               |
+#        |                               |                               |                               |
+# +------v------+                 +------v------+                 +------v------+                 +------v------+
+# | AES encrypt |                 | AES encrypt |                 | AES encrypt |                 | AES encrypt |
+# | with key    |                 | with key    |                 | with key    |                 | with key    |
+# +------+------+                 +------+------+                 +------+------+                 +------+------+
+#        |                               |                               |                               |
+#        |                               |                               |                               |
+#     +--v--+     +----------+        +--v--+     +----------+        +--v--+     +----------+        +--v--+     +----------+
+#     | XOR <-----+ cipher 0 |        | XOR <-----+ cipher 1 |        | XOR <-----+ cipher 2 |        | XOR <-----+ cipher 3 |
+#     +--+--+     +----------+        +--+--+     +----------+        +--+--+     +----------+        +--+--+     +----------+
+#        |                               |                               |                               |
+#        |                               |                               |                               |
+#   +----v----+                     +----v----+                     +----v----+                     +----v----+
+#   | plain 0 |                     | plain 1 |                     | plain 2 |                     | plain 3 |
+#   +---------+                     +---------+                     +---------+                     +---------+
+#
+# To produce N (4 in the diagram above) output/plaintext blocks we require as inputs:
+# - iv
+# - N ciphertext blocks
+# The N-th ciphertext block is not encrypted and becomes the next iv input.
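+#
+# In C terms the same data flow is, roughly (reference sketch only, not
+# generated code; AES_encrypt is the one-block primitive from
+# <openssl/aes.h>):
+#
+#     for (n = 0; n < N; n++) {                /* iterations independent */
+#         AES_encrypt(n == 0 ? iv : in + 16 * (n - 1), tmp, key);
+#         for (i = 0; i < 16; i++)
+#             out[16 * n + i] = tmp[i] ^ in[16 * n + i];
+#     }
+#
+# Every AES input (iv, cipher 0 .. cipher <N-2>) is known before the first
+# iteration, which is what allows the 16x and 4x unrolled paths below.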
+# +################################################################# + +$code.=<<___; +.globl ossl_aes_cfb128_vaes_dec +.type ossl_aes_cfb128_vaes_dec,\@function,6 +.balign 64 +ossl_aes_cfb128_vaes_dec: +.cfi_startproc + endbranch +___ + +$inp="%rdi"; # arg0 +$out="%rsi"; # arg1 +$len="%rdx"; # arg2 + +$key_original="%rcx"; # arg3 +$key_backup="%r10"; + +$ivp="%r8"; # arg4 +$nump="%r9"; # arg5 + +$num="%r11"; +$left="%rcx"; +$mask="%rax"; + +$rounds="%r11d"; + +$temp="%xmm2"; +$temp_4x="%zmm2"; +$temp_8x="%zmm4"; +$temp_12x="%zmm0"; +$temp_16x="%zmm6"; + +$cipher="%xmm3"; +$cipher_4x="%zmm3"; +$cipher_8x="%zmm5"; +$cipher_12x="%zmm1"; +$cipher_16x="%zmm16"; + +$code.=<<___; + + mov ($nump),$num # $num is the current byte index in the first partial block + # $num belongs to 0..15; non-zero means a partial first block + + test $len,$len # return early if $len==0, unlikely to occur + jz .Laes_cfb128_vaes_dec_done +___ + +$code.=<<___ if($win64); + sub \$0x10,%rsp +.cfi_adjust_cfa_offset 16 + vmovdqu %xmm6,(%rsp) # xmm6 needs to be maintained for Windows +___ + +$code.=<<___; + test $num,$num # check if the first block is partial + jz .Laes_cfb128_dec_mid # if not, jump to processing full blocks + +########################################################### +# first partial block pre-processing +########################################################### + + mov $key_original,$key_backup # make room for variable shl with cl + + mov \$0x10,$left # first block is partial + sub $num,$left # calculate how many bytes $left to process in the block + cmp $len,$left # + cmova $len,$left # $left = min(16-$num,$len) + + mov \$1,$mask # build a mask with the least significant $left bits set + shlq %cl,$mask # $left is left shift counter + dec $mask # $mask is 2^$left-1 + kmovq $mask,%k1 + + lea ($num,$left),%rax # keep in-out num in %al, advance by $left + and \$0x0F,%al # wrap-around in a 16-byte block + + leaq ($num,$ivp),%r11 # process $left iv bytes + vmovdqu8 (%r11),%xmm0 + vmovdqu8 ($inp),%xmm1 # process $left input bytes + vpxor %xmm0,%xmm1,%xmm2 # CipherFeedBack XOR + vmovdqu8 %xmm2,($out){%k1} # write $left output bytes + vmovdqu8 %xmm1,(%r11){%k1} # blend $left input bytes into iv + + add $left,$inp # advance pointers + add $left,$out + sub $left,$len + jz .Laes_cfb128_dec_zero_pre # return early if no AES encryption required + + mov $key_backup,$key_original # restore "key_original" as arg3 + +.Laes_cfb128_dec_mid: +___ + + &load_aes_key_schedule_4x(); + +$code.=<<___; +########################################################### +# inner full blocks processing +########################################################### + + # $temp_4x is "iv | iv | iv | iv" + vbroadcasti32x4 ($ivp),$temp_4x # load iv + + cmp \$0x100,$len # are there 16 ciphertext blocks left (2048 bits) ? 
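+ # 0x100 bytes = 16 blocks of 16 bytes each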
+ jb .Laes_cfb128_dec_check_4x
+
+###########################################################
+# decrypt groups of 16 128-bit blocks in parallel
+# behaves as 16x loop unroll
+###########################################################
+
+.balign 32
+.Loop_aes_cfb128_dec_mid_16x:
+ sub \$0x100,$len
+
+ # load 16 ciphertext blocks
+
+ # $cipher_4x is "ciphertext 0 | ciphertext 1 | ciphertext 2 | ciphertext 3"
+ vmovdqu32 ($inp),$cipher_4x
+ # $cipher_8x is "ciphertext 4 | ciphertext 5 | ciphertext 6 | ciphertext 7"
+ vmovdqu32 64($inp),$cipher_8x
+ # $cipher_12x is "ciphertext 8 | ciphertext 9 | ciphertext 10 | ciphertext 11"
+ vmovdqu32 128($inp),$cipher_12x
+ # $cipher_16x is "ciphertext 12 | ciphertext 13 | ciphertext 14 | ciphertext 15"
+ vmovdqu32 192($inp),$cipher_16x
+
+ # $temp_4x is "iv | ciphertext 0 | ciphertext 1 | ciphertext 2"
+ valignq \$6,$temp_4x,$cipher_4x,$temp_4x
+ # $temp_8x is "ciphertext 3 | ciphertext 4 | ciphertext 5 | ciphertext 6"
+ valignq \$6,$cipher_4x,$cipher_8x,$temp_8x
+ # $temp_12x is "ciphertext 7 | ciphertext 8 | ciphertext 9 | ciphertext 10"
+ valignq \$6,$cipher_8x,$cipher_12x,$temp_12x
+ # $temp_16x is "ciphertext 11 | ciphertext 12 | ciphertext 13 | ciphertext 14"
+ valignq \$6,$cipher_12x,$cipher_16x,$temp_16x
+
+ lea 256($inp),$inp # $inp points to next ciphertext
+___
+
+ &vaes_encrypt_block_16x(".Laes_cfb128_dec_mid_16x");
+
+$code.=<<___;
+
+ vpxord $cipher_4x,$temp_4x,$temp_4x # CipherFeedBack XOR of 16 blocks
+ vpxord $cipher_8x,$temp_8x,$temp_8x
+ vpxord $cipher_12x,$temp_12x,$temp_12x
+ vpxord $cipher_16x,$temp_16x,$temp_16x
+
+ cmp \$0x100,$len
+
+ vmovdqu32 $temp_4x,($out) # write 16 plaintext blocks
+ vmovdqu32 $temp_8x,64($out)
+ vmovdqu32 $temp_12x,128($out)
+ vmovdqu32 $temp_16x,192($out)
+
+ vmovdqu8 $cipher_16x,$temp_4x
+
+ lea 256($out),$out # $out points to the next output block
+
+ jae .Loop_aes_cfb128_dec_mid_16x
+
+ vextracti64x2 \$3,$cipher_16x,$temp # latest ciphertext block is next decryption iv
+ vinserti32x4 \$3,$temp,$temp_4x,$temp_4x # keep the latest ciphertext block in lane 3
+ # in preparation for the next 4x shuffle
+
+ xor %eax,%eax # reset $num when processing full blocks
+
+ vmovdqu $temp,($ivp) # latest ciphertext block is next decryption iv
+
+.Laes_cfb128_dec_check_4x:
+ cmp \$0x40,$len # are there 4 ciphertext blocks left (512 bits) ?
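+ # 0x40 bytes = 4 blocks of 16 bytes each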
+ jb .Laes_cfb128_dec_check_1x
+
+###########################################################
+# decrypt groups of 4 128-bit blocks in parallel
+# behaves as 4x loop unroll
+###########################################################
+
+# expects $temp_4x to contain "iv" in the 3rd (most significant) lane
+
+.balign 32
+.Loop_aes_cfb128_dec_mid_4x:
+ sub \$0x40,$len
+
+ # $cipher_4x is "ciphertext 0 | ciphertext 1 | ciphertext 2 | ciphertext 3"
+ vmovdqu32 ($inp),$cipher_4x # load 4 ciphertext blocks
+
+ # $temp_4x is "iv | ciphertext 0 | ciphertext 1 | ciphertext 2"
+ valignq \$6,$temp_4x,$cipher_4x,$temp_4x
+
+ lea 64($inp),$inp # $inp points to next ciphertext
+___
+
+ &vaes_encrypt_block_4x(".Laes_cfb128_dec_mid_4x");
+
+$code.=<<___;
+ vpxord $cipher_4x,$temp_4x,$temp_4x # CipherFeedBack XOR of 4 blocks
+ cmp \$0x40,$len
+ vmovdqu32 $temp_4x,($out) # write 4 plaintext blocks
+ vmovdqu8 $cipher_4x,$temp_4x
+ lea 64($out),$out # $out points to the next output block
+
+ jae .Loop_aes_cfb128_dec_mid_4x
+
+ vextracti64x2 \$3,$temp_4x,$temp # latest ciphertext block is next decryption iv
+ # lane 3 becomes the single-block iv for the 1x loop below
+
+ xor %eax,%eax # reset $num when processing full blocks
+
+ vmovdqu $temp,($ivp) # latest ciphertext block is next decryption iv
+
+.Laes_cfb128_dec_check_1x:
+ cmp \$0x10,$len # are there full ciphertext blocks left (128 bits) ?
+ jb .Laes_cfb128_dec_post
+
+###########################################################
+# decrypt the rest of full 128-bit blocks in series
+###########################################################
+
+# expects $temp to contain iv
+
+.balign 32
+.Loop_aes_cfb128_dec_mid_1x:
+ sub \$0x10,$len
+
+ vmovdqu ($inp),$cipher # load ciphertext block
+ lea 16($inp),$inp # $inp points to next ciphertext
+___
+
+ &vaes_encrypt_block_1x(".Loop_aes_cfb128_dec_mid_1x_inner");
+
+$code.=<<___;
+ vpxor $cipher,$temp,$temp # CipherFeedBack XOR
+ cmp \$0x10,$len
+ vmovdqu $temp,($out) # write plaintext
+ vmovdqu8 $cipher,$temp
+ lea 16($out),$out # $out points to the next output block
+ jae .Loop_aes_cfb128_dec_mid_1x
+
+ xor %eax,%eax # reset $num when processing full blocks
+
+ vmovdqu $temp,($ivp) # latest ciphertext block is next decryption input
+
+.Laes_cfb128_dec_post:
+
+###########################################################
+# last partial block post-processing
+###########################################################
+
+ test $len,$len # check if the last block is partial
+ jz .Laes_cfb128_dec_zero_all
+___
+
+ &vaes_encrypt_block_1x(".Loop_aes_cfb128_dec_post");
+
+$code.=<<___;
+
+ mov $len,%rax # num=$len
+ mov \$1,%r11 # build a mask with the least significant $len bits set
+ mov %dl,%cl # $len is left shift counter less than 16
+ shlq %cl,%r11
+ dec %r11 # mask is 2^$len-1
+ kmovq %r11,%k1
+
+ vmovdqu8 ($inp),%xmm1{%k1}{z} # read $len input bytes, zero the rest to not impact XOR
+ vpxor $temp,%xmm1,%xmm0 # CipherFeedBack XOR
+ vmovdqu8 %xmm0,($out){%k1} # write $len output bytes
+ vpblendmb %xmm1,$temp,${temp}{%k1} # blend $len input bytes into iv
+
+ vmovdqu8 $temp,($ivp) # write chained/streaming iv
+
+ # clear registers
+
+.Laes_cfb128_dec_zero_all:
+ vpxord %xmm17,%xmm17,%xmm17 # clear the AES key schedule
+ vpxord %xmm18,%xmm18,%xmm18 # zero the upper lanes of zmm registers
+ vpxord %xmm19,%xmm19,%xmm19
+ vpxord %xmm20,%xmm20,%xmm20
+ vpxord %xmm21,%xmm21,%xmm21
+ vpxord %xmm22,%xmm22,%xmm22
+ vpxord %xmm23,%xmm23,%xmm23
+ vpxord %xmm24,%xmm24,%xmm24
+ vpxord %xmm25,%xmm25,%xmm25
+ vpxord %xmm26,%xmm26,%xmm26
+ vpxord %xmm27,%xmm27,%xmm27
+ vpxord %xmm28,%xmm28,%xmm28
+ vpxord %xmm29,%xmm29,%xmm29
+ vpxord %xmm30,%xmm30,%xmm30
+ vpxord %xmm31,%xmm31,%xmm31
+
+ vpxord %xmm3,%xmm3,%xmm3 # clear registers used during AES encryption
+ vpxord %xmm4,%xmm4,%xmm4
+ vpxord %xmm5,%xmm5,%xmm5
+ vpxord %xmm6,%xmm6,%xmm6
+ vpxord %xmm16,%xmm16,%xmm16
+
+.Laes_cfb128_dec_zero_pre:
+
+ vpxord %xmm0,%xmm0,%xmm0 # clear the rest of the registers
+ vpxord %xmm1,%xmm1,%xmm1
+ vpxord %xmm2,%xmm2,%xmm2
+
+ vzeroupper
+___
+
+$code.=<<___ if($win64);
+ vmovdqu (%rsp),%xmm6 # xmm6 needs to be maintained for Windows
+ add \$16,%rsp
+.cfi_adjust_cfa_offset -16
+___
+
+$code.=<<___;
+ mov %rax,($nump) # num is in/out, update for future/chained calls
+
+.Laes_cfb128_vaes_dec_done:
+ ret
+.cfi_endproc
+.size ossl_aes_cfb128_vaes_dec,.-ossl_aes_cfb128_vaes_dec
+___
+
+} else {
+
+$code .= <<___;
+.globl ossl_aes_cfb128_vaes_enc
+.globl ossl_aes_cfb128_vaes_dec
+
+# Mock implementations of AES-CFB128 encryption/decryption
+# that always fail. Should not be executed under normal circumstances.
+
+ossl_aes_cfb128_vaes_enc:
+ossl_aes_cfb128_vaes_dec:
+ .byte 0x0f,0x0b # Undefined Instruction in the Intel architecture
+ # Raises the Invalid Opcode exception
+ ret
+
+#################################################################
+# Signature:
+#
+# int ossl_aes_cfb128_vaes_eligible(void);
+#
+# Always returns 0 (not eligible), meaning that the tooling does not support
+# the Intel AVX-512 extensions. Signals higher-level code to fall back
+# to an alternative implementation.
+#################################################################
+
+.globl ossl_aes_cfb128_vaes_eligible
+.type ossl_aes_cfb128_vaes_eligible,\@abi-omnipotent
+ossl_aes_cfb128_vaes_eligible:
+ xor %eax,%eax
+ ret
+.size ossl_aes_cfb128_vaes_eligible, .-ossl_aes_cfb128_vaes_eligible
+___
+}
+
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index ed79316b006..cb3bb5f2deb 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -10,7 +10,7 @@ IF[{- !$disabled{asm} -}]
   $AESASM_x86_64=\
         aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s \
         aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s \
-        aesni-xts-avx512.s
+        aesni-xts-avx512.s aes-cfb-avx512.s
   $AESDEF_x86_64=AES_ASM VPAES_ASM BSAES_ASM
 
   $AESASM_ia64=aes_core.c aes_cbc.c aes-ia64.s
@@ -107,6 +107,7 @@ GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl
 GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl
 GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl
 GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl
+GENERATE[aes-cfb-avx512.s]=asm/aes-cfb-avx512.pl
 GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl
 GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl
 GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 5de98e65393..6ee56b874b7 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -440,6 +440,7 @@ my %globals;
     ($self->{asterisk}) && ($sz="q") ||
     ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) ||
     ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) ||
+    ($mnemonic =~ /^vbroadcasti32x4$/) && ($sz="x") ||
     ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) ||
     ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x");
diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h
index bdd51976a7a..91eccc4ba45 100644
--- a/include/crypto/aes_platform.h
+++ b/include/crypto/aes_platform.h
@@ -208,6 +208,14
@@ int aesni_set_encrypt_key(const unsigned char *userKey, int bits, int aesni_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); +void ossl_aes_cfb128_vaes_enc(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *ks, + const unsigned char ivec[16], ossl_ssize_t *num); +void ossl_aes_cfb128_vaes_dec(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *ks, + const unsigned char ivec[16], ossl_ssize_t *num); +int ossl_aes_cfb128_vaes_eligible(void); + void aesni_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); void aesni_decrypt(const unsigned char *in, unsigned char *out, diff --git a/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc b/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc index fa0289a8b20..fae3f8a8409 100644 --- a/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc +++ b/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc @@ -8,14 +8,56 @@ */ /*- - * AES-NI support for AES mode cfb. + * AES-NI and VAES support for AES CFB mode. * This file is included by cipher_aes_cfb_hw.c */ +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) + #define cipher_hw_vaes_cfb128 aes_cfb128_vaes_encdec_wrapper +#else + #define cipher_hw_vaes_cfb128 ossl_cipher_hw_generic_cfb128 + int ossl_aes_cfb128_vaes_eligible() { + return 0; + } +#endif +#define cipher_hw_vaes_cfb8 ossl_cipher_hw_generic_cfb8 +#define cipher_hw_vaes_cfb1 ossl_cipher_hw_generic_cfb1 + #define cipher_hw_aesni_cfb128 ossl_cipher_hw_generic_cfb128 #define cipher_hw_aesni_cfb8 ossl_cipher_hw_generic_cfb8 #define cipher_hw_aesni_cfb1 ossl_cipher_hw_generic_cfb1 +static int ossl_aes_cfb8_vaes_eligible(void) { return 0; } +static int ossl_aes_cfb1_vaes_eligible(void) { return 0; } + +/* active in 64-bit builds when AES-NI, AVX512F, and VAES are detected */ +static int aes_cfb128_vaes_encdec_wrapper( + PROV_CIPHER_CTX* dat, + unsigned char *out, + const unsigned char *in, + size_t len) +{ + ossl_ssize_t num; + + num = (ossl_ssize_t)dat->num; + + if (num < 0) { + /* behavior from CRYPTO_cfb128_encrypt */ + dat->num = -1; + return 1; + } + + if (dat->enc) + ossl_aes_cfb128_vaes_enc(in, out, len, dat->ks, dat->iv, &num); + else + ossl_aes_cfb128_vaes_dec(in, out, len, dat->ks, dat->iv, &num); + + dat->num = (int)num; + + return 1; +} + +/* generates AES round keys for AES-NI and VAES implementations */ static int cipher_hw_aesni_initkey(PROV_CIPHER_CTX *dat, const unsigned char *key, size_t keylen) { @@ -43,7 +85,15 @@ static const PROV_CIPHER_HW aesni_##mode = { \ cipher_hw_aesni_initkey, \ cipher_hw_aesni_##mode, \ cipher_hw_aes_copyctx \ +}; \ +static const PROV_CIPHER_HW vaes_##mode = { \ + cipher_hw_aesni_initkey, \ + cipher_hw_vaes_##mode, \ + cipher_hw_aes_copyctx \ }; #define PROV_CIPHER_HW_select(mode) \ -if (AESNI_CAPABLE) \ - return &aesni_##mode; +if (AESNI_CAPABLE) { \ + if (ossl_aes_##mode##_vaes_eligible()) \ + return &vaes_##mode; \ + return &aesni_##mode; \ +}
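For the cfb128 mode, for example, PROV_CIPHER_HW_select(cfb128) above
expands (with the identifiers defined in this patch) to:

    if (AESNI_CAPABLE) {
        if (ossl_aes_cfb128_vaes_eligible())
            return &vaes_cfb128;   /* VAES/AVX-512 wrapper path */
        return &aesni_cfb128;      /* plain AES-NI path */
    }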