From: Stanciu, Adrian Date: Fri, 30 May 2025 16:17:26 +0000 (+0300) Subject: Add AES-CFB128 optimizations with Intel AVX-512 and VAES X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=055dd1d8bb24ba307981091524fdf06da3771641;p=thirdparty%2Fopenssl.git Add AES-CFB128 optimizations with Intel AVX-512 and VAES Reviewed-by: Neil Horman Reviewed-by: Shane Lontis (Merged from https://github.com/openssl/openssl/pull/26902) --- diff --git a/CHANGES.md b/CHANGES.md index 67f83c5528c..cf0aaa1a0c2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -127,6 +127,12 @@ OpenSSL 3.6 *Daniel Van Geest (CryptoNext Security)* + * Added Intel AVX-512 and VAES optimizations for AES-CFB128 algorithms. + Encryption performance on large buffers improved by 1.5-1.7x, + while decryption speed increased by 20-23x. + + *Adrian Stanciu* + OpenSSL 3.5 ----------- diff --git a/crypto/aes/asm/aes-cfb-avx512.pl b/crypto/aes/asm/aes-cfb-avx512.pl new file mode 100644 index 00000000000..8136f16e55b --- /dev/null +++ b/crypto/aes/asm/aes-cfb-avx512.pl @@ -0,0 +1,1033 @@ +#! /usr/bin/env perl +# Copyright 2025 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2025, Intel Corporation. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Implements AES-CFB128 encryption and decryption with Intel(R) VAES + +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$avx512vaes=0; # will be non-zero if tooling supports Intel AVX-512 and VAES + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx512vaes = ($1>=2.30); +} + +if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx512vaes = ($1==2.13 && $2>=3) + ($1>=2.14); +} + +if (!$avx512vaes && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+\.[0-9]+)\./) { + $avx512vaes = ($1>=14.16); +} + +if (!$avx512vaes && `$ENV{CC} -v 2>&1` + =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) { + my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 + if ($1) { + # Apple conditions, they use a different version series, see + # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2 + # clang 7.0.0 is Apple clang 10.0.1 + $avx512vaes = ($ver>=10.0001) + } else { + $avx512vaes = ($ver>=7.0); + } +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +################################################################## + +$code=".text\n"; + +if ($avx512vaes) { + +$code.=<<___; +.extern OPENSSL_ia32cap_P + +################################################################# +# Signature: +# +# int ossl_aes_cfb128_vaes_eligible(void); +# +# Detects if the underlying hardware supports all the features +# required to run the Intel AVX-512 implementations of 
AES-CFB128 algorithms.
+#
+# Returns: non-zero if all the required features are detected, 0 otherwise
+#################################################################
+
+.globl ossl_aes_cfb128_vaes_eligible
+.type ossl_aes_cfb128_vaes_eligible,\@abi-omnipotent
+.balign 64
+
+ossl_aes_cfb128_vaes_eligible:
+.cfi_startproc
+ endbranch
+
+ mov OPENSSL_ia32cap_P+8(%rip),%ecx
+ xor %eax,%eax
+
+ # Check 3rd 32-bit word of OPENSSL_ia32cap_P for the feature bit(s):
+ # AVX512BW (bit 30) + AVX512DQ (bit 17) + AVX512F (bit 16)
+
+ and \$0x40030000,%ecx # mask is 1<<30|1<<17|1<<16
+ cmp \$0x40030000,%ecx
+ jne .Laes_cfb128_vaes_eligible_done
+
+ mov OPENSSL_ia32cap_P+12(%rip),%ecx
+
+ # Check 4th 32-bit word of OPENSSL_ia32cap_P for the feature bit(s):
+ # AVX512VAES (bit 9)
+
+ and \$0x200,%ecx # mask is 1<<9
+ cmp \$0x200,%ecx
+ cmove %ecx,%eax
+
+.Laes_cfb128_vaes_eligible_done:
+ ret
+.cfi_endproc
+.size ossl_aes_cfb128_vaes_eligible, .-ossl_aes_cfb128_vaes_eligible
+___
+
+#################################################################
+#
+# AES subroutines for:
+# - preloading the AES key schedule into AVX registers
+# - single-block AES encryption used by CFB encryption and decryption
+# - multiple-block AES encryption used by CFB decryption
+#
+# The CFB mode only uses block cipher encryption.
+#
+# The AES encryption step is described in Section 5.1 Cipher() of
+# FIPS 197 https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197-upd1.pdf
+# and implemented with Intel(R) AES-NI and VAES instructions:
+#
+# - AESKEYGENASSIST for key expansion (done elsewhere, in aesni_set_encrypt_key())
+# - VPXORD for AES pre-whitening
+# - VAESENC for performing one AES encryption round
+# - VAESENCLAST for performing the last AES encryption round
+#
+# For more information please consult:
+# - the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual,
+#   Chapter 21: Cryptography & Finite Field Arithmetic Instructions
+#   https://www.intel.com/content/www/us/en/developer/articles/technical/intel64-and-ia32-architectures-optimization.html
+# - the Intel(R) Advanced Encryption Standard (AES) New Instructions Set Whitepaper
+#   https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf
+#
+#################################################################
+
+# expects the key schedule address in $key_original
+sub load_aes_key_schedule_1x() {
+$code.=<<___;
+ vmovdqu8 0($key_original),%xmm17 # schedule 0 whitening
+ vmovdqu8 16($key_original),%xmm18 # 1
+ vmovdqu8 32($key_original),%xmm19 # 2
+ vmovdqu8 48($key_original),%xmm20 # 3
+ vmovdqu8 64($key_original),%xmm21 # 4
+ vmovdqu8 80($key_original),%xmm22 # 5
+ vmovdqu8 96($key_original),%xmm23 # 6
+ vmovdqu8 112($key_original),%xmm24 # 7
+ vmovdqu8 128($key_original),%xmm25 # 8
+ vmovdqu8 144($key_original),%xmm26 # 9
+ vmovdqu8 160($key_original),%xmm27 # 10 last for AES-128
+ vmovdqu8 176($key_original),%xmm28 # 11
+ vmovdqu8 192($key_original),%xmm29 # 12 last for AES-192
+ vmovdqu8 208($key_original),%xmm30 # 13
+ vmovdqu8 224($key_original),%xmm31 # 14 last for AES-256
+
+ mov 240($key_original),$rounds # load AES rounds
+ # 240 is the byte-offset of the rounds field in AES_KEY
+___
+}
+
+
+# expects the key schedule address in $key_original
+sub load_aes_key_schedule_4x() {
+$code.=<<___;
+ vbroadcasti32x4 0($key_original),%zmm17 # schedule 0 whitening
+ vbroadcasti32x4 16($key_original),%zmm18 # 1
+ vbroadcasti32x4 32($key_original),%zmm19 # 2
+ vbroadcasti32x4 48($key_original),%zmm20 # 3
+
vbroadcasti32x4 64($key_original),%zmm21 # 4 + vbroadcasti32x4 80($key_original),%zmm22 # 5 + vbroadcasti32x4 96($key_original),%zmm23 # 6 + vbroadcasti32x4 112($key_original),%zmm24 # 7 + vbroadcasti32x4 128($key_original),%zmm25 # 8 + vbroadcasti32x4 144($key_original),%zmm26 # 9 + vbroadcasti32x4 160($key_original),%zmm27 # 10 last for AES-128 + vbroadcasti32x4 176($key_original),%zmm28 # 11 + vbroadcasti32x4 192($key_original),%zmm29 # 12 last for AES-192 + vbroadcasti32x4 208($key_original),%zmm30 # 13 + vbroadcasti32x4 224($key_original),%zmm31 # 14 last for AES-256 + + mov 240($key_original),$rounds # load AES rounds + # 240 is the byte-offset of the rounds field in AES_KEY +___ +} + +# Performs AES encryption of 1 128-bit block +# Expects iv in $temp, non-final AES rounds in $rounds and key schedule in xmm17..31 +sub vaes_encrypt_block_1x() { + my ($label_prefix)=@_; +$code.=<<___; + vpxord %xmm17,$temp,$temp # AES pre-whitening + vaesenc %xmm18,$temp,$temp + vaesenc %xmm19,$temp,$temp + vaesenc %xmm20,$temp,$temp + vaesenc %xmm21,$temp,$temp + vaesenc %xmm22,$temp,$temp + vaesenc %xmm23,$temp,$temp + vaesenc %xmm24,$temp,$temp + vaesenc %xmm25,$temp,$temp + vaesenc %xmm26,$temp,$temp + + cmp \$0x09,$rounds + ja ${label_prefix}_192_256 + + vaesenclast %xmm27,$temp,$temp # last AES-128 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_192_256: + + vaesenc %xmm27,$temp,$temp + vaesenc %xmm28,$temp,$temp + + cmp \$0x0B,$rounds + ja ${label_prefix}_256 + + vaesenclast %xmm29,$temp,$temp # last AES-192 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_256: + + vaesenc %xmm29,$temp,$temp + vaesenc %xmm30,$temp,$temp + vaesenclast %xmm31,$temp,$temp # last AES-256 encryption round + +.balign 32 +${label_prefix}_end: +___ +} + + +# Performs parallel AES encryption of 4 128-bit blocks +# Expects iv in $temp_4x, non-final AES rounds in $rounds and key schedule in zmm17..31 +sub vaes_encrypt_block_4x() { + my ($label_prefix)=@_; +$code.=<<___; + vpxord %zmm17,$temp_4x,$temp_4x # AES pre-whitening + vaesenc %zmm18,$temp_4x,$temp_4x + vaesenc %zmm19,$temp_4x,$temp_4x + vaesenc %zmm20,$temp_4x,$temp_4x + vaesenc %zmm21,$temp_4x,$temp_4x + vaesenc %zmm22,$temp_4x,$temp_4x + vaesenc %zmm23,$temp_4x,$temp_4x + vaesenc %zmm24,$temp_4x,$temp_4x + vaesenc %zmm25,$temp_4x,$temp_4x + vaesenc %zmm26,$temp_4x,$temp_4x + + cmp \$0x09,$rounds + ja ${label_prefix}_192_256 + + vaesenclast %zmm27,$temp_4x,$temp_4x # last AES-128 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_192_256: + + vaesenc %zmm27,$temp_4x,$temp_4x + vaesenc %zmm28,$temp_4x,$temp_4x + + cmp \$0x0B,$rounds + ja ${label_prefix}_256 + + vaesenclast %zmm29,$temp_4x,$temp_4x # last AES-192 encryption round + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_256: + + vaesenc %zmm29,$temp_4x,$temp_4x + vaesenc %zmm30,$temp_4x,$temp_4x + vaesenclast %zmm31,$temp_4x,$temp_4x # last AES-256 encryption round + +.balign 32 +${label_prefix}_end: +___ +} + + +# Performs parallel AES encryption of 16 128-bit blocks +# Expects input in $temp_*x, non-final AES rounds in $rounds and key schedule in zmm17..31 +sub vaes_encrypt_block_16x() { + my ($label_prefix)=@_; +$code.=<<___; + vpxord %zmm17,$temp_4x, $temp_4x # parallel AES pre-whitening + vpxord %zmm17,$temp_8x, $temp_8x + vpxord %zmm17,$temp_12x,$temp_12x + vpxord %zmm17,$temp_16x,$temp_16x + + vaesenc %zmm18,$temp_4x, $temp_4x + vaesenc %zmm18,$temp_8x, $temp_8x + vaesenc %zmm18,$temp_12x,$temp_12x + vaesenc 
%zmm18,$temp_16x,$temp_16x + + vaesenc %zmm19,$temp_4x, $temp_4x + vaesenc %zmm19,$temp_8x, $temp_8x + vaesenc %zmm19,$temp_12x,$temp_12x + vaesenc %zmm19,$temp_16x,$temp_16x + + vaesenc %zmm20,$temp_4x, $temp_4x + vaesenc %zmm20,$temp_8x, $temp_8x + vaesenc %zmm20,$temp_12x,$temp_12x + vaesenc %zmm20,$temp_16x,$temp_16x + + vaesenc %zmm21,$temp_4x, $temp_4x + vaesenc %zmm21,$temp_8x, $temp_8x + vaesenc %zmm21,$temp_12x,$temp_12x + vaesenc %zmm21,$temp_16x,$temp_16x + + vaesenc %zmm22,$temp_4x, $temp_4x + vaesenc %zmm22,$temp_8x, $temp_8x + vaesenc %zmm22,$temp_12x,$temp_12x + vaesenc %zmm22,$temp_16x,$temp_16x + + vaesenc %zmm23,$temp_4x, $temp_4x + vaesenc %zmm23,$temp_8x, $temp_8x + vaesenc %zmm23,$temp_12x,$temp_12x + vaesenc %zmm23,$temp_16x,$temp_16x + + vaesenc %zmm24,$temp_4x, $temp_4x + vaesenc %zmm24,$temp_8x, $temp_8x + vaesenc %zmm24,$temp_12x,$temp_12x + vaesenc %zmm24,$temp_16x,$temp_16x + + vaesenc %zmm25,$temp_4x, $temp_4x + vaesenc %zmm25,$temp_8x, $temp_8x + vaesenc %zmm25,$temp_12x,$temp_12x + vaesenc %zmm25,$temp_16x,$temp_16x + + vaesenc %zmm26,$temp_4x, $temp_4x + vaesenc %zmm26,$temp_8x, $temp_8x + vaesenc %zmm26,$temp_12x,$temp_12x + vaesenc %zmm26,$temp_16x,$temp_16x + + cmp \$0x09,$rounds + ja ${label_prefix}_192_256 + + vaesenclast %zmm27,$temp_4x, $temp_4x # last AES-128 encryption round + vaesenclast %zmm27,$temp_8x, $temp_8x + vaesenclast %zmm27,$temp_12x,$temp_12x + vaesenclast %zmm27,$temp_16x,$temp_16x + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_192_256: + + vaesenc %zmm27,$temp_4x, $temp_4x + vaesenc %zmm27,$temp_8x, $temp_8x + vaesenc %zmm27,$temp_12x,$temp_12x + vaesenc %zmm27,$temp_16x,$temp_16x + + vaesenc %zmm28,$temp_4x, $temp_4x + vaesenc %zmm28,$temp_8x, $temp_8x + vaesenc %zmm28,$temp_12x,$temp_12x + vaesenc %zmm28,$temp_16x,$temp_16x + + cmp \$0x0B,$rounds + ja ${label_prefix}_256 + + vaesenclast %zmm29,$temp_4x, $temp_4x # last AES-192 encryption round + vaesenclast %zmm29,$temp_8x, $temp_8x + vaesenclast %zmm29,$temp_12x,$temp_12x + vaesenclast %zmm29,$temp_16x,$temp_16x + jmp ${label_prefix}_end + +.balign 32 +${label_prefix}_256: + + vaesenc %zmm29,$temp_4x, $temp_4x + vaesenc %zmm29,$temp_8x, $temp_8x + vaesenc %zmm29,$temp_12x,$temp_12x + vaesenc %zmm29,$temp_16x,$temp_16x + + vaesenc %zmm30,$temp_4x, $temp_4x + vaesenc %zmm30,$temp_8x, $temp_8x + vaesenc %zmm30,$temp_12x,$temp_12x + vaesenc %zmm30,$temp_16x,$temp_16x + + vaesenclast %zmm31,$temp_4x, $temp_4x # last AES-256 encryption round + vaesenclast %zmm31,$temp_8x, $temp_8x + vaesenclast %zmm31,$temp_12x,$temp_12x + vaesenclast %zmm31,$temp_16x,$temp_16x + +.balign 32 +${label_prefix}_end: +___ +} + +################################################################# +# Signature: +# +# void ossl_aes_cfb128_vaes_enc( +# const unsigned char *in, +# unsigned char *out, +# size_t len, +# const AES_KEY *ks, +# const unsigned char ivec[16], +# /*in-out*/ ossl_ssize_t *num); +# +# Preconditions: +# - all pointers are valid (not NULL...) 
+# - AES key schedule and rounds in `ks` are precomputed
+#
+# Invariants:
+# - `*num` is between 0 and 15 (inclusive)
+#
+#################################################################
+#
+# The implementation closely follows the encryption half of CRYPTO_cfb128_encrypt:
+# - "pre" step: processes the last bytes of a partial block
+# - "mid" step: processes complete blocks
+# - "post" step: processes the first bytes of a partial block
+#
+# To obtain the next ciphertext block `cipher <n>` from
+# the plaintext block `plain <n>`, the previous ciphertext
+# block `cipher <n-1>` is required as input.
+#
+# The dependency on previous encryption outputs (ciphertexts)
+# makes CFB encryption inherently serial.
+#
+#               +----+                            +----------+
+#               | iv |         +------------------> cipher 0 |
+#               +--+-+         |                  +----------+
+#                  |           |                        |
+#                  |           |                        |
+#           +------v------+    |                 +------v------+
+#           | AES encrypt |    |                 | AES encrypt |
+#           | with key    |    |                 | with key    |
+#           +------+------+    |                 +------+------+
+#                  |           |                        |
+#                  |           |                        |
+# +---------+   +--v--+        |       +---------+   +--v--+
+# | plain 0 +---> XOR |        |       | plain 1 +---> XOR |
+# +---------+   +--+--+        |       +---------+   +--+--+
+#                  |           |                        |
+#                  |           |                        |
+#            +-----v----+      |                  +-----v----+
+#            | cipher 0 +------+                  | cipher 1 |
+#            +----------+                         +----------+
+#
+#################################################################
+
+$code.=<<___;
+.globl ossl_aes_cfb128_vaes_enc
+.type ossl_aes_cfb128_vaes_enc,\@function,6
+.balign 64
+ossl_aes_cfb128_vaes_enc:
+.cfi_startproc
+ endbranch
+___
+
+$inp="%rdi"; # arg0
+$out="%rsi"; # arg1
+$len="%rdx"; # arg2
+
+$key_original="%rcx"; # arg3
+$key_backup="%r10";
+
+$ivp="%r8"; # arg4
+$nump="%r9"; # arg5
+
+$num="%r11";
+$left="%rcx";
+$mask="%rax";
+
+$rounds="%r11d";
+
+$temp="%xmm2";
+$plain="%xmm3";
+
+$code.=<<___;
+
+ mov ($nump),$num # $num is the current byte index in the first partial block
+ # $num belongs to 0..15; non-zero means a partial first block
+
+ test $len,$len # return early if $len==0, unlikely to occur
+ jz .Laes_cfb128_vaes_enc_done
+
+ test $num,$num # check if the first block is partial
+ jz .Laes_cfb128_enc_mid # if not, jump to processing full blocks
+
+###########################################################
+# first partial block pre-processing
+###########################################################
+
+ mov $key_original,$key_backup # make room for variable shl with cl
+
+ mov \$0x10,$left # first block is partial
+ sub $num,$left # calculate how many bytes $left to process in the block
+ cmp $len,$left #
+ cmova $len,$left # $left = min(16-$num,$len)
+
+ mov \$1,$mask # build a mask with the least significant $left bits set
+ shlq %cl,$mask # $left is left shift counter
+ dec $mask # $mask is 2^$left-1
+ kmovq $mask,%k1
+
+ mov $num,%rax # keep in-out $num in %al
+ add $left,%rax # advance $num
+ and \$0x0F,%al # wrap-around $num in a 16-byte block
+
+ leaq ($num,$ivp),%r11 # process $left iv bytes
+ vmovdqu8 (%r11),%xmm0
+ vmovdqu8 ($inp),%xmm1 # process $left input bytes
+ vpxor %xmm0,%xmm1,%xmm2 # CipherFeedBack XOR
+ vmovdqu8 %xmm2,($out){%k1} # write $left output bytes
+ vmovdqu8 %xmm2,(%r11){%k1} # blend $left output bytes into iv
+
+ add $left,$inp # advance pointers
+ add $left,$out
+ sub $left,$len
+ jz .Laes_cfb128_enc_zero_pre # return early if no AES encryption required
+
+ mov $key_backup,$key_original # restore "key_original" as arg3
+
+.Laes_cfb128_enc_mid:
+___
+
+ &load_aes_key_schedule_1x();
+
+$code.=<<___;
+###########################################################
+# inner full blocks processing
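+#
+# In C terms the loop below is, roughly (reference sketch only, not
+# generated code; AES_encrypt is the one-block primitive from
+# <openssl/aes.h>):
+#
+#     while (len >= 16) {
+#         AES_encrypt(iv, iv, key);          /* iv = E_k(C_{n-1})        */
+#         for (i = 0; i < 16; i++)
+#             out[i] = iv[i] ^= in[i];       /* C_n = P_n ^ E_k(C_{n-1}) */
+#         in += 16; out += 16; len -= 16;
+#     }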
+########################################################### + + vmovdqu ($ivp),$temp # load iv + + cmp \$0x10,$len # is there a full plaintext block left (128 bits) ? + jb .Laes_cfb128_enc_post + +.balign 32 +.Loop_aes_cfb128_enc_main: + sub \$0x10,$len + + vmovdqu ($inp),$plain # load plaintext block + lea 16($inp),$inp # $inp points to next plaintext +___ + + &vaes_encrypt_block_1x(".Laes_cfb128_enc_mid"); + +$code.=<<___; + + vpxor $plain,$temp,$temp # CipherFeedBack XOR + cmp \$0x10,$len + vmovdqu $temp,($out) # write ciphertext + lea 16($out),$out # $out points to the next output block + jae .Loop_aes_cfb128_enc_main + + xor %eax,%eax # reset num when processing full blocks + + vmovdqu $temp,($ivp) # latest ciphertext block is next encryption input + +.Laes_cfb128_enc_post: + +########################################################### +# last partial block post-processing +########################################################### + + test $len,$len # check if the last block is partial + jz .Laes_cfb128_enc_zero_all +___ + + &vaes_encrypt_block_1x(".Laes_cfb128_enc_post"); + +$code.=<<___; + + mov $len,%rax # num=$len + + mov \$1,%r11 # build a mask with the least significant $len bits set + mov %dl,%cl # $len is left shift counter less than 16 + shlq %cl,%r11 + dec %r11 # mask is 2^$len-1 + kmovq %r11,%k1 + + vmovdqu8 ($inp),%xmm1{%k1}{z} # read $len input bytes, zero the rest to not impact XOR + vpxor $temp,%xmm1,%xmm0 # CipherFeedBack XOR + vmovdqu8 %xmm0,($out){%k1} # write $len output bytes + vmovdqu8 %xmm0,($ivp) # write chained/streaming iv + + # clear registers + +.Laes_cfb128_enc_zero_all: + vpxord %xmm17,%xmm17,%xmm17 # clear the AES key schedule + vpxord %xmm18,%xmm18,%xmm18 + vpxord %xmm19,%xmm19,%xmm19 + vpxord %xmm20,%xmm20,%xmm20 + vpxord %xmm21,%xmm21,%xmm21 + vpxord %xmm22,%xmm22,%xmm22 + vpxord %xmm23,%xmm23,%xmm23 + vpxord %xmm24,%xmm24,%xmm24 + vpxord %xmm25,%xmm25,%xmm25 + vpxord %xmm26,%xmm26,%xmm26 + vpxord %xmm27,%xmm27,%xmm27 + vpxord %xmm28,%xmm28,%xmm28 + vpxord %xmm29,%xmm29,%xmm29 + vpxord %xmm30,%xmm30,%xmm30 + vpxord %xmm31,%xmm31,%xmm31 + + vpxor %xmm3,%xmm3,%xmm3 # clear registers used during AES encryption + +.Laes_cfb128_enc_zero_pre: + vpxor %xmm0,%xmm0,%xmm0 # clear the rest of the registers + vpxor %xmm1,%xmm1,%xmm1 + vpxor %xmm2,%xmm2,%xmm2 + + mov %rax,($nump) # num is in/out, update for future/chained calls + + vzeroupper + +.Laes_cfb128_vaes_enc_done: + ret +.cfi_endproc +.size ossl_aes_cfb128_vaes_enc,.-ossl_aes_cfb128_vaes_enc +___ + + +################################################################# +# Signature: +# +# void ossl_aes_cfb128_vaes_dec( +# const unsigned char *in, +# unsigned char *out, +# size_t len, +# const AES_KEY *ks, +# const unsigned char ivec[16], +# /*in-out*/ ossl_ssize_t *num); +# +# Preconditions: +# - all pointers are valid (not NULL...) 
+# - AES key schedule and rounds in `ks` are precomputed
+#
+# Invariants:
+# - `*num` is between 0 and 15 (inclusive)
+#
+#################################################################
+#
+# The implementation closely follows the decryption half of CRYPTO_cfb128_encrypt:
+#
+# - "pre" step: processes the last bytes of a partial block
+# - "mid" step: processes complete blocks using an unrolled approach:
+#   - processes 16 blocks in parallel until fewer than 16 blocks remain
+#   - processes 4 blocks in parallel until fewer than 4 blocks remain
+#   - processes 1 block in series until none are left
+# - "post" step: processes the first bytes of a partial block
+#
+# To obtain the next plaintext block `plain <n>` from
+# its ciphertext block `cipher <n>`, the previous ciphertext
+# block `cipher <n-1>` is required as input.
+#
+# Since CFB decryption for the current block depends only on
+# the iv and ciphertext blocks (already available as inputs)
+# and not on plaintext blocks, it can be efficiently parallelized.
+#
+#     +----+                        +----------+                    +----------+                    +----------+
+#     | iv |                        | cipher 0 |                    | cipher 1 |                    | cipher 2 |
+#     +--+-+                        +----+-----+                    +----+-----+                    +----+-----+
+#        |                               |                               |                               |
+#        |                               |                               |                               |
+# +------v------+                 +------v------+                 +------v------+                 +------v------+
+# | AES encrypt |                 | AES encrypt |                 | AES encrypt |                 | AES encrypt |
+# | with key    |                 | with key    |                 | with key    |                 | with key    |
+# +------+------+                 +------+------+                 +------+------+                 +------+------+
+#        |                               |                               |                               |
+#        |                               |                               |                               |
+#     +--v--+     +----------+        +--v--+     +----------+        +--v--+     +----------+        +--v--+     +----------+
+#     | XOR <-----+ cipher 0 |        | XOR <-----+ cipher 1 |        | XOR <-----+ cipher 2 |        | XOR <-----+ cipher 3 |
+#     +--+--+     +----------+        +--+--+     +----------+        +--+--+     +----------+        +--+--+     +----------+
+#        |                               |                               |                               |
+#        |                               |                               |                               |
+#   +----v----+                     +----v----+                     +----v----+                     +----v----+
+#   | plain 0 |                     | plain 1 |                     | plain 2 |                     | plain 3 |
+#   +---------+                     +---------+                     +---------+                     +---------+
+#
+# To produce N (4 in the diagram above) output/plaintext blocks we require as inputs:
+# - iv
+# - N ciphertext blocks
+# The N-th ciphertext block is not encrypted and becomes the next iv input.
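+#
+# In C terms the same data flow is, roughly (reference sketch only, not
+# generated code; AES_encrypt is the one-block primitive from
+# <openssl/aes.h>):
+#
+#     for (n = 0; n < N; n++) {                /* iterations independent */
+#         AES_encrypt(n == 0 ? iv : in + 16 * (n - 1), tmp, key);
+#         for (i = 0; i < 16; i++)
+#             out[16 * n + i] = tmp[i] ^ in[16 * n + i];
+#     }
+#
+# Every AES input (iv, cipher 0 .. cipher <N-2>) is known before the first
+# iteration, which is what allows the 16x and 4x unrolled paths below.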
+# +################################################################# + +$code.=<<___; +.globl ossl_aes_cfb128_vaes_dec +.type ossl_aes_cfb128_vaes_dec,\@function,6 +.balign 64 +ossl_aes_cfb128_vaes_dec: +.cfi_startproc + endbranch +___ + +$inp="%rdi"; # arg0 +$out="%rsi"; # arg1 +$len="%rdx"; # arg2 + +$key_original="%rcx"; # arg3 +$key_backup="%r10"; + +$ivp="%r8"; # arg4 +$nump="%r9"; # arg5 + +$num="%r11"; +$left="%rcx"; +$mask="%rax"; + +$rounds="%r11d"; + +$temp="%xmm2"; +$temp_4x="%zmm2"; +$temp_8x="%zmm4"; +$temp_12x="%zmm0"; +$temp_16x="%zmm6"; + +$cipher="%xmm3"; +$cipher_4x="%zmm3"; +$cipher_8x="%zmm5"; +$cipher_12x="%zmm1"; +$cipher_16x="%zmm16"; + +$code.=<<___; + + mov ($nump),$num # $num is the current byte index in the first partial block + # $num belongs to 0..15; non-zero means a partial first block + + test $len,$len # return early if $len==0, unlikely to occur + jz .Laes_cfb128_vaes_dec_done +___ + +$code.=<<___ if($win64); + sub \$0x10,%rsp +.cfi_adjust_cfa_offset 16 + vmovdqu %xmm6,(%rsp) # xmm6 needs to be maintained for Windows +___ + +$code.=<<___; + test $num,$num # check if the first block is partial + jz .Laes_cfb128_dec_mid # if not, jump to processing full blocks + +########################################################### +# first partial block pre-processing +########################################################### + + mov $key_original,$key_backup # make room for variable shl with cl + + mov \$0x10,$left # first block is partial + sub $num,$left # calculate how many bytes $left to process in the block + cmp $len,$left # + cmova $len,$left # $left = min(16-$num,$len) + + mov \$1,$mask # build a mask with the least significant $left bits set + shlq %cl,$mask # $left is left shift counter + dec $mask # $mask is 2^$left-1 + kmovq $mask,%k1 + + lea ($num,$left),%rax # keep in-out num in %al, advance by $left + and \$0x0F,%al # wrap-around in a 16-byte block + + leaq ($num,$ivp),%r11 # process $left iv bytes + vmovdqu8 (%r11),%xmm0 + vmovdqu8 ($inp),%xmm1 # process $left input bytes + vpxor %xmm0,%xmm1,%xmm2 # CipherFeedBack XOR + vmovdqu8 %xmm2,($out){%k1} # write $left output bytes + vmovdqu8 %xmm1,(%r11){%k1} # blend $left input bytes into iv + + add $left,$inp # advance pointers + add $left,$out + sub $left,$len + jz .Laes_cfb128_dec_zero_pre # return early if no AES encryption required + + mov $key_backup,$key_original # restore "key_original" as arg3 + +.Laes_cfb128_dec_mid: +___ + + &load_aes_key_schedule_4x(); + +$code.=<<___; +########################################################### +# inner full blocks processing +########################################################### + + # $temp_4x is "iv | iv | iv | iv" + vbroadcasti32x4 ($ivp),$temp_4x # load iv + + cmp \$0x100,$len # are there 16 ciphertext blocks left (2048 bits) ? 
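+ # 0x100 bytes = 16 blocks of 16 bytes each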
+ jb .Laes_cfb128_dec_check_4x
+
+###########################################################
+# decrypt groups of 16 128-bit blocks in parallel
+# behaves as 16x loop unroll
+###########################################################
+
+.balign 32
+.Loop_aes_cfb128_dec_mid_16x:
+ sub \$0x100,$len
+
+ # load 16 ciphertext blocks
+
+ # $cipher_4x is "ciphertext 0 | ciphertext 1 | ciphertext 2 | ciphertext 3"
+ vmovdqu32 ($inp),$cipher_4x
+ # $cipher_8x is "ciphertext 4 | ciphertext 5 | ciphertext 6 | ciphertext 7"
+ vmovdqu32 64($inp),$cipher_8x
+ # $cipher_12x is "ciphertext 8 | ciphertext 9 | ciphertext 10 | ciphertext 11"
+ vmovdqu32 128($inp),$cipher_12x
+ # $cipher_16x is "ciphertext 12 | ciphertext 13 | ciphertext 14 | ciphertext 15"
+ vmovdqu32 192($inp),$cipher_16x
+
+ # $temp_4x is "iv | ciphertext 0 | ciphertext 1 | ciphertext 2"
+ valignq \$6,$temp_4x,$cipher_4x,$temp_4x
+ # $temp_8x is "ciphertext 3 | ciphertext 4 | ciphertext 5 | ciphertext 6"
+ valignq \$6,$cipher_4x,$cipher_8x,$temp_8x
+ # $temp_12x is "ciphertext 7 | ciphertext 8 | ciphertext 9 | ciphertext 10"
+ valignq \$6,$cipher_8x,$cipher_12x,$temp_12x
+ # $temp_16x is "ciphertext 11 | ciphertext 12 | ciphertext 13 | ciphertext 14"
+ valignq \$6,$cipher_12x,$cipher_16x,$temp_16x
+
+ lea 256($inp),$inp # $inp points to next ciphertext
+___
+
+ &vaes_encrypt_block_16x(".Laes_cfb128_dec_mid_16x");
+
+$code.=<<___;
+
+ vpxord $cipher_4x,$temp_4x,$temp_4x # CipherFeedBack XOR of 16 blocks
+ vpxord $cipher_8x,$temp_8x,$temp_8x
+ vpxord $cipher_12x,$temp_12x,$temp_12x
+ vpxord $cipher_16x,$temp_16x,$temp_16x
+
+ cmp \$0x100,$len
+
+ vmovdqu32 $temp_4x,($out) # write 16 plaintext blocks
+ vmovdqu32 $temp_8x,64($out)
+ vmovdqu32 $temp_12x,128($out)
+ vmovdqu32 $temp_16x,192($out)
+
+ vmovdqu8 $cipher_16x,$temp_4x
+
+ lea 256($out),$out # $out points to the next output block
+
+ jae .Loop_aes_cfb128_dec_mid_16x
+
+ vextracti64x2 \$3,$cipher_16x,$temp # latest ciphertext block is next decryption iv
+ vinserti32x4 \$3,$temp,$temp_4x,$temp_4x # keep the latest ciphertext block in lane 3
+ # in preparation for the next 4x shuffle
+
+ xor %eax,%eax # reset $num when processing full blocks
+
+ vmovdqu $temp,($ivp) # latest ciphertext block is next decryption iv
+
+.Laes_cfb128_dec_check_4x:
+ cmp \$0x40,$len # are there 4 ciphertext blocks left (512 bits) ?
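+ # 0x40 bytes = 4 blocks of 16 bytes each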
+ jb .Laes_cfb128_dec_check_1x
+
+###########################################################
+# decrypt groups of 4 128-bit blocks in parallel
+# behaves as 4x loop unroll
+###########################################################
+
+# expects $temp_4x to contain "iv" in the 3rd (most significant) lane
+
+.balign 32
+.Loop_aes_cfb128_dec_mid_4x:
+ sub \$0x40,$len
+
+ # $cipher_4x is "ciphertext 0 | ciphertext 1 | ciphertext 2 | ciphertext 3"
+ vmovdqu32 ($inp),$cipher_4x # load 4 ciphertext blocks
+
+ # $temp_4x is "iv | ciphertext 0 | ciphertext 1 | ciphertext 2"
+ valignq \$6,$temp_4x,$cipher_4x,$temp_4x
+
+ lea 64($inp),$inp # $inp points to next ciphertext
+___
+
+ &vaes_encrypt_block_4x(".Laes_cfb128_dec_mid_4x");
+
+$code.=<<___;
+ vpxord $cipher_4x,$temp_4x,$temp_4x # CipherFeedBack XOR of 4 blocks
+ cmp \$0x40,$len
+ vmovdqu32 $temp_4x,($out) # write 4 plaintext blocks
+ vmovdqu8 $cipher_4x,$temp_4x
+ lea 64($out),$out # $out points to the next output block
+
+ jae .Loop_aes_cfb128_dec_mid_4x
+
+ vextracti64x2 \$3,$temp_4x,$temp # latest ciphertext block is next decryption iv
+ # lane 3 becomes the single-block iv for the 1x loop below
+
+ xor %eax,%eax # reset $num when processing full blocks
+
+ vmovdqu $temp,($ivp) # latest ciphertext block is next decryption iv
+
+.Laes_cfb128_dec_check_1x:
+ cmp \$0x10,$len # are there full ciphertext blocks left (128 bits) ?
+ jb .Laes_cfb128_dec_post
+
+###########################################################
+# decrypt the rest of full 128-bit blocks in series
+###########################################################
+
+# expects $temp to contain iv
+
+.balign 32
+.Loop_aes_cfb128_dec_mid_1x:
+ sub \$0x10,$len
+
+ vmovdqu ($inp),$cipher # load ciphertext block
+ lea 16($inp),$inp # $inp points to next ciphertext
+___
+
+ &vaes_encrypt_block_1x(".Loop_aes_cfb128_dec_mid_1x_inner");
+
+$code.=<<___;
+ vpxor $cipher,$temp,$temp # CipherFeedBack XOR
+ cmp \$0x10,$len
+ vmovdqu $temp,($out) # write plaintext
+ vmovdqu8 $cipher,$temp
+ lea 16($out),$out # $out points to the next output block
+ jae .Loop_aes_cfb128_dec_mid_1x
+
+ xor %eax,%eax # reset $num when processing full blocks
+
+ vmovdqu $temp,($ivp) # latest ciphertext block is next decryption input
+
+.Laes_cfb128_dec_post:
+
+###########################################################
+# last partial block post-processing
+###########################################################
+
+ test $len,$len # check if the last block is partial
+ jz .Laes_cfb128_dec_zero_all
+___
+
+ &vaes_encrypt_block_1x(".Loop_aes_cfb128_dec_post");
+
+$code.=<<___;
+
+ mov $len,%rax # num=$len
+ mov \$1,%r11 # build a mask with the least significant $len bits set
+ mov %dl,%cl # $len is left shift counter less than 16
+ shlq %cl,%r11
+ dec %r11 # mask is 2^$len-1
+ kmovq %r11,%k1
+
+ vmovdqu8 ($inp),%xmm1{%k1}{z} # read $len input bytes, zero the rest to not impact XOR
+ vpxor $temp,%xmm1,%xmm0 # CipherFeedBack XOR
+ vmovdqu8 %xmm0,($out){%k1} # write $len output bytes
+ vpblendmb %xmm1,$temp,${temp}{%k1} # blend $len input bytes into iv
+
+ vmovdqu8 $temp,($ivp) # write chained/streaming iv
+
+ # clear registers
+
+.Laes_cfb128_dec_zero_all:
+ vpxord %xmm17,%xmm17,%xmm17 # clear the AES key schedule
+ vpxord %xmm18,%xmm18,%xmm18 # zero the upper lanes of zmm registers
+ vpxord %xmm19,%xmm19,%xmm19
+ vpxord %xmm20,%xmm20,%xmm20
+ vpxord %xmm21,%xmm21,%xmm21
+ vpxord %xmm22,%xmm22,%xmm22
+ vpxord %xmm23,%xmm23,%xmm23
+ vpxord %xmm24,%xmm24,%xmm24
+ vpxord %xmm25,%xmm25,%xmm25
+ vpxord %xmm26,%xmm26,%xmm26
+ vpxord %xmm27,%xmm27,%xmm27
+ vpxord %xmm28,%xmm28,%xmm28
+ vpxord %xmm29,%xmm29,%xmm29
+ vpxord %xmm30,%xmm30,%xmm30
+ vpxord %xmm31,%xmm31,%xmm31
+
+ vpxord %xmm3,%xmm3,%xmm3 # clear registers used during AES encryption
+ vpxord %xmm4,%xmm4,%xmm4
+ vpxord %xmm5,%xmm5,%xmm5
+ vpxord %xmm6,%xmm6,%xmm6
+ vpxord %xmm16,%xmm16,%xmm16
+
+.Laes_cfb128_dec_zero_pre:
+
+ vpxord %xmm0,%xmm0,%xmm0 # clear the rest of the registers
+ vpxord %xmm1,%xmm1,%xmm1
+ vpxord %xmm2,%xmm2,%xmm2
+
+ vzeroupper
+___
+
+$code.=<<___ if($win64);
+ vmovdqu (%rsp),%xmm6 # xmm6 needs to be maintained for Windows
+ add \$16,%rsp
+.cfi_adjust_cfa_offset -16
+___
+
+$code.=<<___;
+ mov %rax,($nump) # num is in/out, update for future/chained calls
+
+.Laes_cfb128_vaes_dec_done:
+ ret
+.cfi_endproc
+.size ossl_aes_cfb128_vaes_dec,.-ossl_aes_cfb128_vaes_dec
+___
+
+} else {
+
+$code .= <<___;
+.globl ossl_aes_cfb128_vaes_enc
+.globl ossl_aes_cfb128_vaes_dec
+
+# Mock implementations of AES-CFB128 encryption/decryption
+# that always fail. Should not be executed under normal circumstances.
+
+ossl_aes_cfb128_vaes_enc:
+ossl_aes_cfb128_vaes_dec:
+ .byte 0x0f,0x0b # Undefined Instruction in the Intel architecture
+ # Raises the Invalid Opcode exception
+ ret
+
+#################################################################
+# Signature:
+#
+# int ossl_aes_cfb128_vaes_eligible(void);
+#
+# Always returns 0 (not eligible), meaning that the tooling does not support
+# the Intel AVX-512 extensions. Signals higher-level code to fall back
+# to an alternative implementation.
+#################################################################
+
+.globl ossl_aes_cfb128_vaes_eligible
+.type ossl_aes_cfb128_vaes_eligible,\@abi-omnipotent
+ossl_aes_cfb128_vaes_eligible:
+ xor %eax,%eax
+ ret
+.size ossl_aes_cfb128_vaes_eligible, .-ossl_aes_cfb128_vaes_eligible
+___
+}
+
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index ed79316b006..cb3bb5f2deb 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -10,7 +10,7 @@ IF[{- !$disabled{asm} -}]
   $AESASM_x86_64=\
         aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s \
         aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s \
-        aesni-xts-avx512.s
+        aesni-xts-avx512.s aes-cfb-avx512.s
   $AESDEF_x86_64=AES_ASM VPAES_ASM BSAES_ASM
 
   $AESASM_ia64=aes_core.c aes_cbc.c aes-ia64.s
@@ -107,6 +107,7 @@ GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl
 GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl
 GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl
 GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl
+GENERATE[aes-cfb-avx512.s]=asm/aes-cfb-avx512.pl
 GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl
 GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl
 GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 5de98e65393..6ee56b874b7 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -440,6 +440,7 @@ my %globals;
     ($self->{asterisk}) && ($sz="q") ||
     ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) ||
     ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) ||
+    ($mnemonic =~ /^vbroadcasti32x4$/) && ($sz="x") ||
     ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) ||
     ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x");
diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h
index bdd51976a7a..91eccc4ba45 100644
--- a/include/crypto/aes_platform.h
+++ b/include/crypto/aes_platform.h
@@ -208,6 +208,14
@@ int aesni_set_encrypt_key(const unsigned char *userKey, int bits, int aesni_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); +void ossl_aes_cfb128_vaes_enc(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *ks, + const unsigned char ivec[16], ossl_ssize_t *num); +void ossl_aes_cfb128_vaes_dec(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *ks, + const unsigned char ivec[16], ossl_ssize_t *num); +int ossl_aes_cfb128_vaes_eligible(void); + void aesni_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); void aesni_decrypt(const unsigned char *in, unsigned char *out, diff --git a/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc b/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc index fa0289a8b20..fae3f8a8409 100644 --- a/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc +++ b/providers/implementations/ciphers/cipher_aes_cfb_hw_aesni.inc @@ -8,14 +8,56 @@ */ /*- - * AES-NI support for AES mode cfb. + * AES-NI and VAES support for AES CFB mode. * This file is included by cipher_aes_cfb_hw.c */ +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)) + #define cipher_hw_vaes_cfb128 aes_cfb128_vaes_encdec_wrapper +#else + #define cipher_hw_vaes_cfb128 ossl_cipher_hw_generic_cfb128 + int ossl_aes_cfb128_vaes_eligible() { + return 0; + } +#endif +#define cipher_hw_vaes_cfb8 ossl_cipher_hw_generic_cfb8 +#define cipher_hw_vaes_cfb1 ossl_cipher_hw_generic_cfb1 + #define cipher_hw_aesni_cfb128 ossl_cipher_hw_generic_cfb128 #define cipher_hw_aesni_cfb8 ossl_cipher_hw_generic_cfb8 #define cipher_hw_aesni_cfb1 ossl_cipher_hw_generic_cfb1 +static int ossl_aes_cfb8_vaes_eligible(void) { return 0; } +static int ossl_aes_cfb1_vaes_eligible(void) { return 0; } + +/* active in 64-bit builds when AES-NI, AVX512F, and VAES are detected */ +static int aes_cfb128_vaes_encdec_wrapper( + PROV_CIPHER_CTX* dat, + unsigned char *out, + const unsigned char *in, + size_t len) +{ + ossl_ssize_t num; + + num = (ossl_ssize_t)dat->num; + + if (num < 0) { + /* behavior from CRYPTO_cfb128_encrypt */ + dat->num = -1; + return 1; + } + + if (dat->enc) + ossl_aes_cfb128_vaes_enc(in, out, len, dat->ks, dat->iv, &num); + else + ossl_aes_cfb128_vaes_dec(in, out, len, dat->ks, dat->iv, &num); + + dat->num = (int)num; + + return 1; +} + +/* generates AES round keys for AES-NI and VAES implementations */ static int cipher_hw_aesni_initkey(PROV_CIPHER_CTX *dat, const unsigned char *key, size_t keylen) { @@ -43,7 +85,15 @@ static const PROV_CIPHER_HW aesni_##mode = { \ cipher_hw_aesni_initkey, \ cipher_hw_aesni_##mode, \ cipher_hw_aes_copyctx \ +}; \ +static const PROV_CIPHER_HW vaes_##mode = { \ + cipher_hw_aesni_initkey, \ + cipher_hw_vaes_##mode, \ + cipher_hw_aes_copyctx \ }; #define PROV_CIPHER_HW_select(mode) \ -if (AESNI_CAPABLE) \ - return &aesni_##mode; +if (AESNI_CAPABLE) { \ + if (ossl_aes_##mode##_vaes_eligible()) \ + return &vaes_##mode; \ + return &aesni_##mode; \ +}
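For the cfb128 mode, for example, PROV_CIPHER_HW_select(cfb128) above
expands (with the identifiers defined in this patch) to:

    if (AESNI_CAPABLE) {
        if (ossl_aes_cfb128_vaes_eligible())
            return &vaes_cfb128;   /* VAES/AVX-512 wrapper path */
        return &aesni_cfb128;      /* plain AES-NI path */
    }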