lib/accelerated/x86/XXX/ghash-x86_64.s \
lib/accelerated/x86/XXX/aesni-x86_64.s \
lib/accelerated/x86/XXX/aesni-x86.s \
- lib/accelerated/x86/XXX/e_padlock-x86_64.s \
- lib/accelerated/x86/XXX/e_padlock-x86.s \
lib/accelerated/x86/XXX/sha1-ssse3-x86.s \
lib/accelerated/x86/XXX/sha1-ssse3-x86_64.s \
lib/accelerated/x86/XXX/sha256-ssse3-x86.s \
lib/accelerated/x86/XXX/sha512-ssse3-x86.s \
lib/accelerated/x86/XXX/sha512-ssse3-x86_64.s \
+ lib/accelerated/x86/XXX/aesni-gcm-x86_64.s \
lib/accelerated/x86/XXX/aes-ssse3-x86.s \
lib/accelerated/x86/XXX/aes-ssse3-x86_64.s
XXX/sha256-ssse3-x86.s XXX/sha512-ssse3-x86.s XXX/aes-ssse3-x86.s
X86_64_FILES=XXX/aesni-x86_64.s XXX/cpuid-x86_64.s XXX/ghash-x86_64.s \
- XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s
+ XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s \
+ XXX/aesni-gcm-x86_64.s
X86_PADLOCK_FILES=XXX/e_padlock-x86.s
X86_64_PADLOCK_FILES=XXX/e_padlock-x86_64.s
# Appro's code
lib/accelerated/x86/elf/%.s: devel/perlasm/%.pl .submodule.stamp
cat $<.license > $@
- perl $< elf >> $@
+ CC=gcc perl $< elf >> $@
echo "" >> $@
echo ".section .note.GNU-stack,\"\",%progbits" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
lib/accelerated/x86/coff/%-x86.s: devel/perlasm/%-x86.pl .submodule.stamp
cat $<.license > $@
- perl $< coff >> $@
+ CC=gcc perl $< coff >> $@
echo "" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
lib/accelerated/x86/coff/%-x86_64.s: devel/perlasm/%-x86_64.pl .submodule.stamp
cat $<.license > $@
- perl $< mingw64 >> $@
+ CC=gcc perl $< mingw64 >> $@
echo "" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl .submodule.stamp
cat $<.license > $@
- perl $< macosx >> $@
+ CC=gcc perl $< macosx >> $@
echo "" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
--- /dev/null
+../openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
\ No newline at end of file
--- /dev/null
+license.txt
\ No newline at end of file
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@item 0x2: Enable AES-NI
@item 0x4: Enable SSSE3
@item 0x8: Enable PCLMUL
+@item 0x10: Enable AVX
@item 0x100000: Enable VIA padlock
@item 0x200000: Enable VIA PHE
@item 0x400000: Enable VIA PHE SHA512
if ASM_X86_64
AM_CFLAGS += -DASM_X86_64 -DASM_X86
-libx86_la_SOURCES += aes-gcm-x86-pclmul.c
+libx86_la_SOURCES += aes-gcm-x86-pclmul.c aes-gcm-x86-pclmul-avx.c
if WINDOWS
libx86_la_SOURCES += $(X86_64_FILES_COFF)
--- /dev/null
+/*
+ * Copyright (C) 2011-2016 Free Software Foundation, Inc.
+ * Copyright (C) 2015-2016 Red Hat, Inc.
+ *
+ * Author: Nikos Mavrogiannopoulos
+ *
+ * This file is part of GnuTLS.
+ *
+ * The GnuTLS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>
+ *
+ */
+
+/*
+ * The following code is an implementation of the AES-128-GCM cipher
+ * using intel's AES instruction set.
+ */
+
+#include "errors.h"
+#include "gnutls_int.h"
+#include <gnutls/crypto.h>
+#include "errors.h"
+#include <aes-x86.h>
+#include <x86-common.h>
+#include <nettle/memxor.h>
+#include <byteswap.h>
+
+#define GCM_BLOCK_SIZE 16
+
+/* GCM mode with PCLMUL and AVX optimization */
+
+typedef struct {
+ uint64_t hi, lo;
+} u128;
+
+/* This is the gcm128 structure used in openssl. It
+ * is compatible with the included assembly code.
+ */
+struct gcm128_context {
+ union {
+ uint64_t u[2];
+ uint32_t d[4];
+ uint8_t c[16];
+ size_t t[16 / sizeof(size_t)];
+ } Yi, EKi, EK0, len, Xi, H;
+ u128 Htable[16];
+};
+
+struct aes_gcm_ctx {
+ AES_KEY expanded_key;
+ struct gcm128_context gcm;
+};
+
+void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, size_t len);
+void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
+
+static void aes_gcm_deinit(void *_ctx)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+
+ zeroize_temp_key(ctx, sizeof(*ctx));
+ gnutls_free(ctx);
+}
+
+static int
+aes_gcm_cipher_init(gnutls_cipher_algorithm_t algorithm, void **_ctx,
+ int enc)
+{
+ /* we use key size to distinguish */
+ if (algorithm != GNUTLS_CIPHER_AES_128_GCM &&
+ algorithm != GNUTLS_CIPHER_AES_256_GCM)
+ return GNUTLS_E_INVALID_REQUEST;
+
+ *_ctx = gnutls_calloc(1, sizeof(struct aes_gcm_ctx));
+ if (*_ctx == NULL) {
+ gnutls_assert();
+ return GNUTLS_E_MEMORY_ERROR;
+ }
+
+ return 0;
+}
+
+static int
+aes_gcm_cipher_setkey(void *_ctx, const void *userkey, size_t keysize)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ int ret;
+
+ ret =
+ aesni_set_encrypt_key(userkey, keysize * 8,
+ ALIGN16(&ctx->expanded_key));
+ if (ret != 0)
+ return gnutls_assert_val(GNUTLS_E_ENCRYPTION_FAILED);
+
+ aesni_ecb_encrypt(ctx->gcm.H.c, ctx->gcm.H.c,
+ GCM_BLOCK_SIZE, ALIGN16(&ctx->expanded_key), 1);
+
+ ctx->gcm.H.u[0] = bswap_64(ctx->gcm.H.u[0]);
+ ctx->gcm.H.u[1] = bswap_64(ctx->gcm.H.u[1]);
+
+ gcm_init_avx(ctx->gcm.Htable, ctx->gcm.H.u);
+
+ return 0;
+}
+
+static int aes_gcm_setiv(void *_ctx, const void *iv, size_t iv_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+
+ if (iv_size != GCM_BLOCK_SIZE - 4)
+ return gnutls_assert_val(GNUTLS_E_INVALID_REQUEST);
+
+ memset(ctx->gcm.Xi.c, 0, sizeof(ctx->gcm.Xi.c));
+ memset(ctx->gcm.len.c, 0, sizeof(ctx->gcm.len.c));
+
+ memcpy(ctx->gcm.Yi.c, iv, GCM_BLOCK_SIZE - 4);
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 4] = 0;
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 3] = 0;
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 2] = 0;
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 1] = 1;
+
+ aesni_ecb_encrypt(ctx->gcm.Yi.c, ctx->gcm.EK0.c,
+ GCM_BLOCK_SIZE, ALIGN16(&ctx->expanded_key), 1);
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 1] = 2;
+ return 0;
+}
+
+static void
+gcm_ghash(struct aes_gcm_ctx *ctx, const uint8_t * src, size_t src_size)
+{
+ size_t rest = src_size % GCM_BLOCK_SIZE;
+ size_t aligned_size = src_size - rest;
+
+ if (aligned_size > 0)
+ gcm_ghash_avx(ctx->gcm.Xi.u, ctx->gcm.Htable, src,
+ aligned_size);
+
+ if (rest > 0) {
+ memxor(ctx->gcm.Xi.c, src + aligned_size, rest);
+ gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+ }
+}
+
+static inline void
+ctr_encrypt_last(struct aes_gcm_ctx *ctx, const uint8_t * src,
+ uint8_t * dst, size_t pos, size_t length)
+{
+ uint8_t tmp[GCM_BLOCK_SIZE];
+ uint8_t out[GCM_BLOCK_SIZE];
+
+ memcpy(tmp, &src[pos], length);
+ aesni_ctr32_encrypt_blocks(tmp, out, 1,
+ ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c);
+
+ memcpy(&dst[pos], out, length);
+
+}
+
+static int
+aes_gcm_encrypt(void *_ctx, const void *src, size_t src_size,
+ void *dst, size_t length)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ int blocks = src_size / GCM_BLOCK_SIZE;
+ int exp_blocks = blocks * GCM_BLOCK_SIZE;
+ int rest = src_size - (exp_blocks);
+ uint32_t counter;
+
+ if (blocks > 0) {
+ aesni_ctr32_encrypt_blocks(src, dst,
+ blocks,
+ ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c);
+
+ counter = _gnutls_read_uint32(ctx->gcm.Yi.c + 12);
+ counter += blocks;
+ _gnutls_write_uint32(counter, ctx->gcm.Yi.c + 12);
+ }
+
+ if (rest > 0) /* last incomplete block */
+ ctr_encrypt_last(ctx, src, dst, exp_blocks, rest);
+
+ gcm_ghash(ctx, dst, src_size);
+ ctx->gcm.len.u[1] += src_size;
+
+ return 0;
+}
+
+static int
+aes_gcm_decrypt(void *_ctx, const void *src, size_t src_size,
+ void *dst, size_t dst_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ int blocks = src_size / GCM_BLOCK_SIZE;
+ int exp_blocks = blocks * GCM_BLOCK_SIZE;
+ int rest = src_size - (exp_blocks);
+ uint32_t counter;
+
+ gcm_ghash(ctx, src, src_size);
+ ctx->gcm.len.u[1] += src_size;
+
+ if (blocks > 0) {
+ aesni_ctr32_encrypt_blocks(src, dst,
+ blocks,
+ ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c);
+
+ counter = _gnutls_read_uint32(ctx->gcm.Yi.c + 12);
+ counter += blocks;
+ _gnutls_write_uint32(counter, ctx->gcm.Yi.c + 12);
+ }
+
+ if (rest > 0) /* last incomplete block */
+ ctr_encrypt_last(ctx, src, dst, exp_blocks, rest);
+
+ return 0;
+}
+
+static int aes_gcm_auth(void *_ctx, const void *src, size_t src_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+
+ gcm_ghash(ctx, src, src_size);
+ ctx->gcm.len.u[0] += src_size;
+
+ return 0;
+}
+
+
+static void aes_gcm_tag(void *_ctx, void *tag, size_t tagsize)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ uint8_t buffer[GCM_BLOCK_SIZE];
+ uint64_t alen, clen;
+
+ alen = ctx->gcm.len.u[0] * 8;
+ clen = ctx->gcm.len.u[1] * 8;
+
+ _gnutls_write_uint64(alen, buffer);
+ _gnutls_write_uint64(clen, &buffer[8]);
+
+ gcm_ghash_avx(ctx->gcm.Xi.u, ctx->gcm.Htable, buffer,
+ GCM_BLOCK_SIZE);
+
+ ctx->gcm.Xi.u[0] ^= ctx->gcm.EK0.u[0];
+ ctx->gcm.Xi.u[1] ^= ctx->gcm.EK0.u[1];
+
+ memcpy(tag, ctx->gcm.Xi.c, MIN(GCM_BLOCK_SIZE, tagsize));
+}
+
+#ifdef ASM_X86_64
+/* requires AVX */
+static int
+aesni_gcm_aead_encrypt(void *_ctx,
+ const void *nonce, size_t nonce_size,
+ const void *auth, size_t auth_size,
+ size_t tag_size,
+ const void *plain, size_t plain_size,
+ void *encr, size_t encr_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ size_t s = 0;
+
+ if (encr_size < plain_size + tag_size)
+ return gnutls_assert_val(GNUTLS_E_SHORT_MEMORY_BUFFER);
+
+ aes_gcm_setiv(ctx, nonce, nonce_size);
+ aes_gcm_auth(ctx, auth, auth_size);
+
+ if (plain_size >= 96) {
+ s = aesni_gcm_encrypt(plain, encr, plain_size, ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+ ctx->gcm.len.u[1] += s;
+ }
+
+ if ((plain_size-s) > 0)
+ aes_gcm_encrypt(ctx, ((uint8_t*)plain)+s, plain_size-s, ((uint8_t*)encr)+s, encr_size-s);
+
+ aes_gcm_tag(ctx, ((uint8_t*)encr) + plain_size, tag_size);
+
+ return 0;
+}
+
+static int
+aesni_gcm_aead_decrypt(void *_ctx,
+ const void *nonce, size_t nonce_size,
+ const void *auth, size_t auth_size,
+ size_t tag_size,
+ const void *encr, size_t encr_size,
+ void *plain, size_t plain_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ uint8_t tag[MAX_HASH_SIZE];
+ size_t s = 0;
+
+ if (encr_size < tag_size)
+ return gnutls_assert_val(GNUTLS_E_DECRYPTION_FAILED);
+
+ aes_gcm_setiv(ctx, nonce, nonce_size);
+ aes_gcm_auth(ctx, auth, auth_size);
+
+ encr_size -= tag_size;
+
+ if (encr_size >= 96) {
+ s = aesni_gcm_decrypt(encr, plain, encr_size, ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+ ctx->gcm.len.u[1] += s;
+ }
+
+ if ((encr_size-s) > 0) {
+ aes_gcm_decrypt(ctx, ((uint8_t*)encr)+s, encr_size-s, ((uint8_t*)plain)+s, plain_size-s);
+ }
+
+ aes_gcm_tag(ctx, tag, tag_size);
+
+ if (gnutls_memcmp(((uint8_t*)encr)+encr_size, tag, tag_size) != 0)
+ return gnutls_assert_val(GNUTLS_E_DECRYPTION_FAILED);
+
+ return 0;
+}
+#else
+# define aesni_gcm_aead_decrypt aes_gcm_aead_decrypt
+# define aesni_gcm_aead_encrypt aes_gcm_aead_encrypt
+# include "aes-gcm-aead.h"
+#endif
+
+const gnutls_crypto_cipher_st _gnutls_aes_gcm_pclmul_avx = {
+ .init = aes_gcm_cipher_init,
+ .setkey = aes_gcm_cipher_setkey,
+ .setiv = aes_gcm_setiv,
+ .aead_encrypt = aesni_gcm_aead_encrypt,
+ .aead_decrypt = aesni_gcm_aead_decrypt,
+ .encrypt = aes_gcm_encrypt,
+ .decrypt = aes_gcm_decrypt,
+ .deinit = aes_gcm_deinit,
+ .tag = aes_gcm_tag,
+ .auth = aes_gcm_auth,
+};
const void *key,
const unsigned char *ivec);
+size_t aesni_gcm_encrypt(const void *inp, void *out, size_t len,
+ const AES_KEY *key, const unsigned char iv[16], uint64_t* Xi);
+
+size_t aesni_gcm_decrypt(const void *inp, void *out, size_t len,
+ const AES_KEY *key, const unsigned char iv[16], uint64_t* Xi);
+
int vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
int vpaes_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
void vpaes_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_pclmul;
+extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_pclmul_avx;
extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_x86_aesni;
extern const gnutls_crypto_cipher_st _gnutls_aes_ccm_x86_aesni;
extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_x86_ssse3;
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp .Loop_schedule_128
.p2align 4
.Lschedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
.p2align 4
.Lschedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
.def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
movaps 16(%rsp),%xmm6
movaps 32(%rsp),%xmm7
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.p2align 6
leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
leaq 184(%rax),%rax
.Lin_prologue:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
.LSEH_info_vpaes_set_encrypt_key:
.byte 9,0,0,0
.rva se_handler
-.rva .Lenc_key_body,.Lenc_key_epilogue
+.rva .Lenc_key_body,.Lenc_key_epilogue
.LSEH_info_vpaes_set_decrypt_key:
.byte 9,0,0,0
.rva se_handler
-.rva .Ldec_key_body,.Ldec_key_epilogue
+.rva .Ldec_key_body,.Ldec_key_epilogue
.LSEH_info_vpaes_encrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Lenc_body,.Lenc_epilogue
+.rva .Lenc_body,.Lenc_epilogue
.LSEH_info_vpaes_decrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Ldec_body,.Ldec_epilogue
+.rva .Ldec_body,.Ldec_epilogue
.LSEH_info_vpaes_cbc_encrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Lcbc_body,.Lcbc_epilogue
+.rva .Lcbc_body,.Lcbc_epilogue
--- /dev/null
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the Andy Polyakov nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+.def _aesni_ctr32_ghash_6x; .scl 3; .type 32; .endef
+.p2align 5
+_aesni_ctr32_ghash_6x:
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.p2align 5
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.p2align 5
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.p2align 5
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+
+.globl aesni_gcm_decrypt
+.def aesni_gcm_decrypt; .scl 2; .type 32; .endef
+.p2align 5
+aesni_gcm_decrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_aesni_gcm_decrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+ movq 40(%rsp),%r8
+ movq 48(%rsp),%r9
+
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -168(%rsp),%rsp
+ movaps %xmm6,-216(%rax)
+ movaps %xmm7,-200(%rax)
+ movaps %xmm8,-184(%rax)
+ movaps %xmm9,-168(%rax)
+ movaps %xmm10,-152(%rax)
+ movaps %xmm11,-136(%rax)
+ movaps %xmm12,-120(%rax)
+ movaps %xmm13,-104(%rax)
+ movaps %xmm14,-88(%rax)
+ movaps %xmm15,-72(%rax)
+.Lgcm_dec_body:
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movaps -216(%rax),%xmm6
+ movaps -200(%rax),%xmm7
+ movaps -184(%rax),%xmm8
+ movaps -168(%rax),%xmm9
+ movaps -152(%rax),%xmm10
+ movaps -136(%rax),%xmm11
+ movaps -120(%rax),%xmm12
+ movaps -104(%rax),%xmm13
+ movaps -88(%rax),%xmm14
+ movaps -72(%rax),%xmm15
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_aesni_gcm_decrypt:
+.def _aesni_ctr32_6x; .scl 3; .type 32; .endef
+.p2align 5
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.p2align 4
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.p2align 5
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+
+.globl aesni_gcm_encrypt
+.def aesni_gcm_encrypt; .scl 2; .type 32; .endef
+.p2align 5
+aesni_gcm_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_aesni_gcm_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+ movq 40(%rsp),%r8
+ movq 48(%rsp),%r9
+
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -168(%rsp),%rsp
+ movaps %xmm6,-216(%rax)
+ movaps %xmm7,-200(%rax)
+ movaps %xmm8,-184(%rax)
+ movaps %xmm9,-168(%rax)
+ movaps %xmm10,-152(%rax)
+ movaps %xmm11,-136(%rax)
+ movaps %xmm12,-120(%rax)
+ movaps %xmm13,-104(%rax)
+ movaps %xmm14,-88(%rax)
+ movaps %xmm15,-72(%rax)
+.Lgcm_enc_body:
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movaps -216(%rax),%xmm6
+ movaps -200(%rax),%xmm7
+ movaps -184(%rax),%xmm8
+ movaps -168(%rax),%xmm9
+ movaps -152(%rax),%xmm10
+ movaps -136(%rax),%xmm11
+ movaps -120(%rax),%xmm12
+ movaps -104(%rax),%xmm13
+ movaps -88(%rax),%xmm14
+ movaps -72(%rax),%xmm15
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_aesni_gcm_encrypt:
+.p2align 6
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+
+.def gcm_se_handler; .scl 3; .type 32; .endef
+.p2align 4
+gcm_se_handler:
+ pushq %rsi
+ pushq %rdi
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+ subq $64,%rsp
+
+ movq 120(%r8),%rax
+ movq 248(%r8),%rbx
+
+ movq 8(%r9),%rsi
+ movq 56(%r9),%r11
+
+ movl 0(%r11),%r10d
+ leaq (%rsi,%r10,1),%r10
+ cmpq %r10,%rbx
+ jb .Lcommon_seh_tail
+
+ movq 152(%r8),%rax
+
+ movl 4(%r11),%r10d
+ leaq (%rsi,%r10,1),%r10
+ cmpq %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ movq 120(%r8),%rax
+
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ movq %r15,240(%r8)
+ movq %r14,232(%r8)
+ movq %r13,224(%r8)
+ movq %r12,216(%r8)
+ movq %rbp,160(%r8)
+ movq %rbx,144(%r8)
+
+ leaq -216(%rax),%rsi
+ leaq 512(%r8),%rdi
+ movl $20,%ecx
+.long 0xa548f3fc
+
+.Lcommon_seh_tail:
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rsi
+ movq %rax,152(%r8)
+ movq %rsi,168(%r8)
+ movq %rdi,176(%r8)
+
+ movq 40(%r9),%rdi
+ movq %r8,%rsi
+ movl $154,%ecx
+.long 0xa548f3fc
+
+ movq %r9,%rsi
+ xorq %rcx,%rcx
+ movq 8(%rsi),%rdx
+ movq 0(%rsi),%r8
+ movq 16(%rsi),%r9
+ movq 40(%rsi),%r10
+ leaq 56(%rsi),%r11
+ leaq 24(%rsi),%r12
+ movq %r10,32(%rsp)
+ movq %r11,40(%rsp)
+ movq %r12,48(%rsp)
+ movq %rcx,56(%rsp)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ movl $1,%eax
+ addq $64,%rsp
+ popfq
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %rdi
+ popq %rsi
+ .byte 0xf3,0xc3
+
+
+.section .pdata
+.p2align 2
+.rva .LSEH_begin_aesni_gcm_decrypt
+.rva .LSEH_end_aesni_gcm_decrypt
+.rva .LSEH_gcm_dec_info
+
+.rva .LSEH_begin_aesni_gcm_encrypt
+.rva .LSEH_end_aesni_gcm_encrypt
+.rva .LSEH_gcm_enc_info
+.section .xdata
+.p2align 3
+.LSEH_gcm_dec_info:
+.byte 9,0,0,0
+.rva gcm_se_handler
+.rva .Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+.byte 9,0,0,0
+.rva gcm_se_handler
+.rva .Lgcm_enc_body,.Lgcm_enc_abort
+
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.def _aesni_decrypt; .scl 2; .type 32; .endef
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.def __aesni_encrypt2; .scl 3; .type 32; .endef
+.align 16
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.def __aesni_decrypt2; .scl 3; .type 32; .endef
+.align 16
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
ret
.def __aesni_encrypt3; .scl 3; .type 32; .endef
.align 16
__aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz .L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.align 16
__aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz .L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
__aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz .L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
__aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz .L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.align 16
__aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
-.byte 102,15,56,220,241
- movups (%edx),%xmm0
-.byte 102,15,56,220,249
- jmp .L_aesni_encrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L006enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 16
.L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz .L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.align 16
__aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
-.byte 102,15,56,222,241
- movups (%edx),%xmm0
-.byte 102,15,56,222,249
- jmp .L_aesni_decrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L007dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 16
.L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz .L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L008ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L009ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L010ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L011ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L012ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L010ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L013ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L014ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L015ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L016ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L017enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+.L018ecb_enc_two:
+ call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L018ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L019ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L020ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L018ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L021ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L022ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L023ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L024ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L025dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+.L026ecb_dec_two:
+ call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-.L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz .L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L028enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp .L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L033ccm64_dec_outer
.align 16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L030ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-.L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp .L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp .L033ccm64_dec_outer
.align 16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L032enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L033ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L034ctr32_tail
+ jb .L038ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+ jmp .L039ctr32_loop6
+.align 16
+.L039ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movups (%esi),%xmm1
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L035ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L036ctr32_ret
+ jz .L040ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-.L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb .L041ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je .L042ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb .L043ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L040ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L041enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L038ctr32_two:
- call __aesni_encrypt3
+.L042ctr32_two:
+ call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L039ctr32_three:
+.L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_four:
+.L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L042enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L044xts_enc_loop6
+ jc .L047xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L048xts_enc_loop6
.align 16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L048xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L045xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L046xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L047xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L048xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L049xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L046xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L051enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+ call __aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L053xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L050xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L053xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L054enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L055enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L057xts_dec_loop6
+ jc .L060xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L061xts_dec_loop6
.align 16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call .L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L061xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L058xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L059xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L060xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L061xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L062xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L059xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L064dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call __aesni_decrypt3
+ call __aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L066xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L063xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L067dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L068xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L070cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L071cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L072cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L074enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L074enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L073cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L072cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L075cbc_ret
-.L072cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L071cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L076cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L077cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L078cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L078cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L079cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L076cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L080cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L081cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L082cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L083cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L080cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L084dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L084dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+.L085cbc_dec_two:
+ call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_three:
+.L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_four:
+.L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L079cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L085cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L075cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L085cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L075cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L070cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
.def __aesni_set_encrypt_key; .scl 3; .type 32; .endef
.align 16
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L086bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L086bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal __gnutls_x86_cpuid_s,%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08714rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L08812rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L089bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09010rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L091key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L092key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L092key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L091key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm1,%xmm0
ret
.align 16
-.L08812rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L093key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L094key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L095key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L093key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L096key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
pxor %xmm3,%xmm2
ret
.align 16
-.L094key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L096key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08714rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L097key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L099key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L099key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L097key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm1,%xmm0
ret
.align 16
-.L098key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L086bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L089bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.def _aesni_set_encrypt_key; .scl 2; .type 32; .endef
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L100dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L101dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L101dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L100dec_key_ret:
+.L114dec_key_ret:
ret
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.comm __gnutls_x86_cpuid_s,16
# *** This file is auto-generated ***
#
.text
+
.globl aesni_encrypt
.def aesni_encrypt; .scl 2; .type 32; .endef
.p2align 4
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_1
+ jnz .Loop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rdx)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_dec1_2
+ jnz .Loop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rdx)
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+
+.def _aesni_encrypt2; .scl 3; .type 32; .endef
+.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+.def _aesni_decrypt2; .scl 3; .type 32; .endef
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
.byte 0xf3,0xc3
.def _aesni_encrypt3; .scl 3; .type 32; .endef
.p2align 4
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
.p2align 4
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
.p2align 4
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
- movups (%rcx),%xmm0
-.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Lenc_loop6_enter
.p2align 4
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
+.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lenc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
- movups (%rcx),%xmm0
-.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Ldec_loop6_enter
.p2align 4
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
+.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Ldec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
.p2align 4
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
- jmp .Lenc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
.p2align 4
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
- jmp .Ldec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
.p2align 4
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
+.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
movq %r9,%rcx
movq 40(%rsp),%r8
+ leaq -88(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,16(%rsp)
+ movaps %xmm8,32(%rsp)
+ movaps %xmm9,48(%rsp)
+.Lecb_enc_body:
andq $-16,%rdx
jz .Lecb_ret
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.p2align 4
.Lecb_enc_loop8:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_3
+ jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.p2align 4
.Lecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
.p2align 4
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.p2align 4
.Lecb_dec_loop8:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lecb_ret
.p2align 4
.Lecb_dec_one:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_4
+ jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lecb_ret
.p2align 4
.Lecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp .Lecb_ret
.p2align 4
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp .Lecb_ret
.p2align 4
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp .Lecb_ret
.p2align 4
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp .Lecb_ret
.p2align 4
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp)
+ movaps 16(%rsp),%xmm7
+ movaps %xmm0,16(%rsp)
+ movaps 32(%rsp),%xmm8
+ movaps %xmm0,32(%rsp)
+ movaps 48(%rsp),%xmm9
+ movaps %xmm0,48(%rsp)
+ leaq 88(%rsp),%rsp
+.Lecb_enc_ret:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
movaps %xmm9,48(%rsp)
.Lccm64_enc_body:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa .Lincrement64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp .Lccm64_enc_outer
.p2align 4
.Lccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp)
movaps 16(%rsp),%xmm7
+ movaps %xmm0,16(%rsp)
movaps 32(%rsp),%xmm8
+ movaps %xmm0,32(%rsp)
movaps 48(%rsp),%xmm9
+ movaps %xmm0,48(%rsp)
leaq 88(%rsp),%rsp
.Lccm64_enc_ret:
movq 8(%rsp),%rdi
movaps %xmm9,48(%rsp)
.Lccm64_dec_body:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_5
+ jnz .Loop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp .Lccm64_dec_outer
.p2align 4
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
jz .Lccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp .Lccm64_dec2_loop
+.p2align 4
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.p2align 4
.Lccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz .Loop_enc1_6
+ jnz .Loop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp)
movaps 16(%rsp),%xmm7
+ movaps %xmm0,16(%rsp)
movaps 32(%rsp),%xmm8
+ movaps %xmm0,32(%rsp)
movaps 48(%rsp),%xmm9
+ movaps %xmm0,48(%rsp)
leaq 88(%rsp),%rsp
.Lccm64_dec_ret:
movq 8(%rsp),%rdi
movq %r9,%rcx
movq 40(%rsp),%r8
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.p2align 4
+.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $288,%rsp
.Lctr32_body:
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je .Lctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
movdqa %xmm2,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
movdqa %xmm2,112(%rsp)
- movl 240(%rcx),%eax
-
- leaq 1(%r8),%r9
- leaq 2(%r8),%r10
- bswapl %r9d
- bswapl %r10d
- xorl %r11d,%r9d
- xorl %r11d,%r10d
-.byte 102,65,15,58,34,217,3
- leaq 3(%r8),%r9
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,65,15,58,34,226,3
- bswapl %r9d
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%r9d
+ xorl %r11d,%eax
bswapl %r10d
-.byte 102,65,15,58,34,233,3
+.byte 102,15,58,34,232,3
xorl %r11d,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
xorl %r11d,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
+ movl _gnutls_x86_cpuid_s+4(%rip),%r10d
xorl %r11d,%r9d
+ andl $71303168,%r10d
movl %r9d,112+12(%rsp)
movups 16(%rcx),%xmm1
cmpq $8,%rdx
jb .Lctr32_tail
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
leaq 128(%rcx),%rcx
- subq $8,%rdx
+ subq $2,%rdx
jmp .Lctr32_loop8
+.p2align 4
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp .Lctr32_loop6
+
+.p2align 4
+.Lctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call .Lenc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
.p2align 5
.Lctr32_loop8:
addl $8,%r8d
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 48-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 96-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 112-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 128-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 144-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
.byte 102,15,56,220,224
xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
- movdqu 0(%rdi),%xmm10
.byte 102,68,15,56,220,200
movups 160-128(%rcx),%xmm0
- cmpl $11,%eax
jb .Lctr32_enc_done
.byte 102,15,56,220,209
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+.p2align 4
.Lctr32_enc_done:
movdqu 16(%rdi),%xmm11
pxor %xmm0,%xmm10
pxor %xmm0,%xmm13
movdqu 80(%rdi),%xmm15
pxor %xmm0,%xmm14
-.byte 102,15,56,220,209
pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
.byte 102,65,15,56,221,210
pxor %xmm0,%xmm1
- movdqu 112(%rdi),%xmm10
- leaq 128(%rdi),%rdi
+ movdqu 112-128(%rdi),%xmm10
.byte 102,65,15,56,221,219
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
.byte 102,65,15,56,221,228
- movdqa 16(%rsp),%xmm12
.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
.byte 102,65,15,56,221,246
- movdqa 48(%rsp),%xmm14
.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
.byte 102,68,15,56,221,193
movdqa 80(%rsp),%xmm0
-.byte 102,69,15,56,221,202
movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
leaq -128(%rcx),%rcx
.Lctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
+
+ shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
.byte 102,15,56,220,217
- shrl $1,%eax
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,225
- decl %eax
-.byte 102,15,56,220,233
+ addq $16,%rax
movups (%rdi),%xmm10
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
movups 16(%rdi),%xmm11
-.byte 102,15,56,220,249
movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
- movups 16(%rcx),%xmm1
call .Lenc_loop8_enter
.Lctr32_loop4:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop4
.byte 102,15,56,221,209
- movups (%rdi),%xmm10
.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
.byte 102,15,56,221,225
- movups 32(%rdi),%xmm12
.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
xorps %xmm10,%xmm2
.Lctr32_loop3:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop3
.byte 102,15,56,221,209
.byte 102,15,56,221,217
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp .Lctr32_done
-.p2align 4
-.Lctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-.Loop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp .Lctr32_done
-
-.p2align 4
.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
movaps -160(%rbp),%xmm6
+ movaps %xmm0,-160(%rbp)
movaps -144(%rbp),%xmm7
+ movaps %xmm0,-144(%rbp)
movaps -128(%rbp),%xmm8
+ movaps %xmm0,-128(%rbp)
movaps -112(%rbp),%xmm9
+ movaps %xmm0,-112(%rbp)
movaps -96(%rbp),%xmm10
+ movaps %xmm0,-96(%rbp)
movaps -80(%rbp),%xmm11
+ movaps %xmm0,-80(%rbp)
movaps -64(%rbp),%xmm12
+ movaps %xmm0,-64(%rbp)
movaps -48(%rbp),%xmm13
+ movaps %xmm0,-48(%rbp)
movaps -32(%rbp),%xmm14
+ movaps %xmm0,-32(%rbp)
movaps -16(%rbp),%xmm15
+ movaps %xmm0,-16(%rbp)
+ movaps %xmm0,0(%rsp)
+ movaps %xmm0,16(%rsp)
+ movaps %xmm0,32(%rsp)
+ movaps %xmm0,48(%rsp)
+ movaps %xmm0,64(%rsp)
+ movaps %xmm0,80(%rsp)
+ movaps %xmm0,96(%rsp)
+ movaps %xmm0,112(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
leaq (%rsp),%rax
pushq %rbp
- subq $256,%rsp
+ subq $272,%rsp
andq $-16,%rsp
movaps %xmm6,-168(%rax)
movaps %xmm7,-152(%rax)
movaps %xmm15,-24(%rax)
.Lxts_enc_body:
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.p2align 4
+.p2align 5
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.p2align 5
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_enc_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm11
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm13
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_9
+ jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_10
+ jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
movaps -160(%rbp),%xmm6
+ movaps %xmm0,-160(%rbp)
movaps -144(%rbp),%xmm7
+ movaps %xmm0,-144(%rbp)
movaps -128(%rbp),%xmm8
+ movaps %xmm0,-128(%rbp)
movaps -112(%rbp),%xmm9
+ movaps %xmm0,-112(%rbp)
movaps -96(%rbp),%xmm10
+ movaps %xmm0,-96(%rbp)
movaps -80(%rbp),%xmm11
+ movaps %xmm0,-80(%rbp)
movaps -64(%rbp),%xmm12
+ movaps %xmm0,-64(%rbp)
movaps -48(%rbp),%xmm13
+ movaps %xmm0,-48(%rbp)
movaps -32(%rbp),%xmm14
+ movaps %xmm0,-32(%rbp)
movaps -16(%rbp),%xmm15
+ movaps %xmm0,-16(%rbp)
+ movaps %xmm0,0(%rsp)
+ movaps %xmm0,16(%rsp)
+ movaps %xmm0,32(%rsp)
+ movaps %xmm0,48(%rsp)
+ movaps %xmm0,64(%rsp)
+ movaps %xmm0,80(%rsp)
+ movaps %xmm0,96(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
leaq (%rsp),%rax
pushq %rbp
- subq $256,%rsp
+ subq $272,%rsp
andq $-16,%rsp
movaps %xmm6,-168(%rax)
movaps %xmm7,-152(%rax)
movaps %xmm15,-24(%rax)
.Lxts_dec_body:
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.p2align 4
+.p2align 5
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.p2align 5
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_dec_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm12
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm14
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_12
+ jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
.p2align 4
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_13
+ jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_14
+ jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
movaps -160(%rbp),%xmm6
+ movaps %xmm0,-160(%rbp)
movaps -144(%rbp),%xmm7
+ movaps %xmm0,-144(%rbp)
movaps -128(%rbp),%xmm8
+ movaps %xmm0,-128(%rbp)
movaps -112(%rbp),%xmm9
+ movaps %xmm0,-112(%rbp)
movaps -96(%rbp),%xmm10
+ movaps %xmm0,-96(%rbp)
movaps -80(%rbp),%xmm11
+ movaps %xmm0,-80(%rbp)
movaps -64(%rbp),%xmm12
+ movaps %xmm0,-64(%rbp)
movaps -48(%rbp),%xmm13
+ movaps %xmm0,-48(%rbp)
movaps -32(%rbp),%xmm14
+ movaps %xmm0,-32(%rbp)
movaps -16(%rbp),%xmm15
+ movaps %xmm0,-16(%rbp)
+ movaps %xmm0,0(%rsp)
+ movaps %xmm0,16(%rsp)
+ movaps %xmm0,32(%rsp)
+ movaps %xmm0,48(%rsp)
+ movaps %xmm0,64(%rsp)
+ movaps %xmm0,80(%rsp)
+ movaps %xmm0,96(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_15
+ jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp .Lcbc_enc_loop
+ jmp .Lcbc_enc_loop
.p2align 4
.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.p2align 4
+.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $176,%rsp
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
movdqa %xmm5,%xmm14
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
- cmpq $112,%rdx
+ movl _gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
- subq $112,%rdx
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.p2align 4
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- setnc %r11b
.byte 102,68,15,56,222,193
+ setnc %r11b
shlq $7,%r11
.byte 102,68,15,56,222,201
addq %rdi,%r11
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 64-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 80-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 96-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 112-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 128-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 160-112(%rcx),%xmm0
- cmpl $11,%eax
jb .Lcbc_dec_done
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 176-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 208-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.p2align 4
.Lcbc_dec_done:
.byte 102,15,56,222,209
- pxor %xmm0,%xmm10
.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
.byte 102,15,56,222,225
- pxor %xmm0,%xmm12
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
.byte 102,15,56,222,241
- pxor %xmm0,%xmm14
.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
.byte 102,65,15,56,223,219
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
- leaq 128(%rdi),%rdi
.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
movdqu 0(%r11),%xmm11
.byte 102,65,15,56,223,237
- movdqu 16(%r11),%xmm12
.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
movdqu 32(%r11),%xmm13
.byte 102,65,15,56,223,255
- movdqu 48(%r11),%xmm14
.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
movdqu 64(%r11),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
- jle .Lcbc_dec_tail_collected
+ addq $0x70,%rdx
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.p2align 4
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
+.p2align 4
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
- subq $16,%rdx
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.p2align 4
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
.p2align 4
.Lcbc_dec_two:
movaps %xmm3,%xmm12
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor %xmm10,%xmm2
movaps %xmm12,%xmm10
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.p2align 4
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.p2align 4
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.p2align 4
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.p2align 4
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movaps 16(%rsp),%xmm6
+ movaps %xmm0,16(%rsp)
movaps 32(%rsp),%xmm7
+ movaps %xmm0,32(%rsp)
movaps 48(%rsp),%xmm8
+ movaps %xmm0,48(%rsp)
movaps 64(%rsp),%xmm9
+ movaps %xmm0,64(%rsp)
movaps 80(%rsp),%xmm10
+ movaps %xmm0,80(%rsp)
movaps 96(%rsp),%xmm11
+ movaps %xmm0,96(%rsp)
movaps 112(%rsp),%xmm12
+ movaps %xmm0,112(%rsp)
movaps 128(%rsp),%xmm13
+ movaps %xmm0,128(%rsp)
movaps 144(%rsp),%xmm14
+ movaps %xmm0,144(%rsp)
movaps 160(%rsp),%xmm15
+ movaps %xmm0,160(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
.def aesni_set_decrypt_key; .scl 2; .type 32; .endef
.p2align 4
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%edx
testl %eax,%eax
movups (%r8),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rcx)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
.p2align 4
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rcx,%rcx
jz .Lenc_key_ret
testq %r8,%r8
jz .Lenc_key_ret
+ movl $268437504,%r10d
movups (%rcx),%xmm0
xorps %xmm4,%xmm4
+ andl _gnutls_x86_cpuid_s+4(%rip),%r10d
leaq 16(%r8),%rax
cmpl $256,%edx
je .L14rounds
.L10rounds:
movl $9,%edx
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
movups %xmm0,(%r8)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
xorl %eax,%eax
jmp .Lenc_key_ret
+.p2align 4
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%r8)
+ jmp .Loop_key128
+
+.p2align 4
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %edx,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
.p2align 4
.L12rounds:
movq 16(%rcx),%xmm2
movl $11,%edx
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
movups %xmm0,(%r8)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
xorq %rax,%rax
jmp .Lenc_key_ret
+.p2align 4
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%r8)
+ jmp .Loop_key192
+
+.p2align 4
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %edx,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
.p2align 4
.L14rounds:
movups 16(%rcx),%xmm2
movl $13,%edx
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
movups %xmm0,(%r8)
movups %xmm2,16(%r8)
.byte 102,15,58,223,202,1
xorq %rax,%rax
jmp .Lenc_key_ret
+.p2align 4
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%r8)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%r8)
+ jmp .Loop_key256
+
+.p2align 4
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %edx,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
.p2align 4
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
-.def ecb_se_handler; .scl 3; .type 32; .endef
-.p2align 4
-ecb_se_handler:
- pushq %rsi
- pushq %rdi
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushfq
- subq $64,%rsp
-
- movq 152(%r8),%rax
-
- jmp .Lcommon_seh_tail
-
-
-.def ccm64_se_handler; .scl 3; .type 32; .endef
+.def ecb_ccm64_se_handler; .scl 3; .type 32; .endef
.p2align 4
-ccm64_se_handler:
+ecb_ccm64_se_handler:
pushq %rsi
pushq %rdi
pushq %rbx
leaq 0(%rax),%rsi
leaq 512(%r8),%rdi
movl $8,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
leaq 88(%rax),%rax
jmp .Lcommon_seh_tail
leaq -160(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
jmp .Lcommon_rbp_tail
movq 152(%r8),%rax
movq 248(%r8),%rbx
- leaq .Lcbc_decrypt(%rip),%r10
+ leaq .Lcbc_decrypt_bulk(%rip),%r10
cmpq %r10,%rbx
jb .Lcommon_seh_tail
leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
.Lcommon_rbp_tail:
movq 160(%r8),%rax
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
.p2align 3
.LSEH_info_ecb:
.byte 9,0,0,0
-.rva ecb_se_handler
+.rva ecb_ccm64_se_handler
+.rva .Lecb_enc_body,.Lecb_enc_ret
.LSEH_info_ccm64_enc:
.byte 9,0,0,0
-.rva ccm64_se_handler
-.rva .Lccm64_enc_body,.Lccm64_enc_ret
+.rva ecb_ccm64_se_handler
+.rva .Lccm64_enc_body,.Lccm64_enc_ret
.LSEH_info_ccm64_dec:
.byte 9,0,0,0
-.rva ccm64_se_handler
-.rva .Lccm64_dec_body,.Lccm64_dec_ret
+.rva ecb_ccm64_se_handler
+.rva .Lccm64_dec_body,.Lccm64_dec_ret
.LSEH_info_ctr32:
.byte 9,0,0,0
.rva ctr_xts_se_handler
-.rva .Lctr32_body,.Lctr32_epilogue
+.rva .Lctr32_body,.Lctr32_epilogue
.LSEH_info_xts_enc:
.byte 9,0,0,0
.rva ctr_xts_se_handler
-.rva .Lxts_enc_body,.Lxts_enc_epilogue
+.rva .Lxts_enc_body,.Lxts_enc_epilogue
.LSEH_info_xts_dec:
.byte 9,0,0,0
.rva ctr_xts_se_handler
-.rva .Lxts_dec_body,.Lxts_dec_epilogue
+.rva .Lxts_dec_body,.Lxts_dec_epilogue
.LSEH_info_cbc:
.byte 9,0,0,0
.rva cbc_se_handler
.LSEH_info_key:
.byte 0x01,0x04,0x01,0x00
-.byte 0x04,0x02,0x00,0x00
+.byte 0x04,0x02,0x00,0x00
#
.text
+
.globl gcm_gmult_4bit
.def gcm_gmult_4bit; .scl 2; .type 32; .endef
.p2align 4
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.p2align 4
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
.p2align 4
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
.L_init_clmul:
.LSEH_begin_gcm_init_clmul:
-.byte 0x48,0x83,0xec,0x18
-.byte 0x0f,0x29,0x34,0x24
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
movdqu (%rdx),%xmm2
pshufd $78,%xmm2,%xmm2
leaq -136(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
-.byte 0x48,0x8d,0x60,0xe0
-.byte 0x0f,0x29,0x70,0xe0
-.byte 0x0f,0x29,0x78,0xf0
-.byte 0x44,0x0f,0x29,0x00
-.byte 0x44,0x0f,0x29,0x48,0x10
-.byte 0x44,0x0f,0x29,0x50,0x20
-.byte 0x44,0x0f,0x29,0x58,0x30
-.byte 0x44,0x0f,0x29,0x60,0x40
-.byte 0x44,0x0f,0x29,0x68,0x50
-.byte 0x44,0x0f,0x29,0x70,0x60
-.byte 0x44,0x0f,0x29,0x78,0x70
- movdqa .Lbswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rcx),%xmm0
movdqu (%rdx),%xmm2
- movdqu 32(%rdx),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rdx),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%r9
+ subq $0x10,%r9
jz .Lodd_tail
- movdqu 16(%rdx),%xmm9
- cmpq $48,%r9
+ movdqu 16(%rdx),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%r9
jb .Lskip4x
- subq $48,%r9
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $0x30,%r9
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rdx),%xmm14
movdqu 64(%rdx),%xmm15
- movdqu 48(%r8),%xmm6
+ movdqu 48(%r8),%xmm3
movdqu 32(%r8),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rdx),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%r8),%xmm11
- movdqu 0(%r8),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%r8),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%r8),%r8
- subq $64,%r9
+ subq $0x40,%r9
jc .Ltail4x
jmp .Lmod4_loop
.p2align 5
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%r8),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%r8),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%r8),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rdx),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rdx),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa .L7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%r8),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%r8),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%r8),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rdx),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%r8),%r8
- subq $64,%r9
+ subq $0x40,%r9
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%r9
+ addq $0x40,%r9
jz .Ldone
- movdqu 32(%rdx),%xmm10
- subq $16,%r9
+ movdqu 32(%rdx),%xmm7
+ subq $0x10,%r9
jz .Lodd_tail
.Lskip4x:
- movdqu (%r8),%xmm3
- movdqu 16(%r8),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+ movdqu 16(%r8),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%r8),%r8
- subq $32,%r9
+ nop
+ subq $0x20,%r9
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.p2align 5
.Lmod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%r8),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%r8),%xmm6
-
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%r8),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%r8),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%r8),%r8
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%r8),%r8
- subq $32,%r9
+ subq $0x20,%r9
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
jnz .Ldone
.Lodd_tail:
- movdqu (%r8),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rcx)
movaps (%rsp),%xmm6
movaps 16(%rsp),%xmm7
.def gcm_init_avx; .scl 2; .type 32; .endef
.p2align 5
gcm_init_avx:
- jmp .L_init_clmul
+.LSEH_begin_gcm_init_avx:
+
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
+ vzeroupper
+
+ vmovdqu (%rdx),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.p2align 5
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rcx)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rcx)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rcx)
+ leaq 48(%rcx),%rcx
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rcx)
+
+ vzeroupper
+ movaps (%rsp),%xmm6
+ leaq 24(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+ .byte 0xf3,0xc3
.globl gcm_gmult_avx
.def gcm_gmult_avx; .scl 2; .type 32; .endef
.def gcm_ghash_avx; .scl 2; .type 32; .endef
.p2align 5
gcm_ghash_avx:
- jmp .L_ghash_clmul
+ leaq -136(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ vzeroupper
+
+ vmovdqu (%rcx),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rdx),%rdx
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%r9
+ jb .Lshort_avx
+ subq $0x80,%r9
+
+ vmovdqu 112(%r8),%xmm14
+ vmovdqu 0-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rdx),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%r8),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rdx),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%r8),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rdx),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%r8),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rdx),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%r8),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%r8),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rdx),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%r8),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%r8),%r8
+ cmpq $0x80,%r9
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%r9
+ jmp .Loop8x_avx
+
+.p2align 5
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%r8),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rdx),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%r8),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%r8),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%r8),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rdx),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%r8),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rdx),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rdx),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%r8),%r8
+ subq $0x80,%r9
+ jnc .Loop8x_avx
+
+ addq $0x80,%r9
+ jmp .Ltail_no_xor_avx
+
+.p2align 5
+.Lshort_avx:
+ vmovdqu -16(%r8,%r9,1),%xmm14
+ leaq (%r8,%r9,1),%r8
+ vmovdqu 0-64(%rdx),%xmm6
+ vmovdqu 32-64(%rdx),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jmp .Ltail_avx
+
+.p2align 5
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%r9
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rcx)
+ vzeroupper
+ movaps (%rsp),%xmm6
+ movaps 16(%rsp),%xmm7
+ movaps 32(%rsp),%xmm8
+ movaps 48(%rsp),%xmm9
+ movaps 64(%rsp),%xmm10
+ movaps 80(%rsp),%xmm11
+ movaps 96(%rsp),%xmm12
+ movaps 112(%rsp),%xmm13
+ movaps 128(%rsp),%xmm14
+ movaps 144(%rsp),%xmm15
+ leaq 168(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+ .byte 0xf3,0xc3
.p2align 6
.Lbswap_mask:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+.rva .LSEH_begin_gcm_init_avx
+.rva .LSEH_end_gcm_init_avx
+.rva .LSEH_info_gcm_init_clmul
+
+.rva .LSEH_begin_gcm_ghash_avx
+.rva .LSEH_end_gcm_ghash_avx
+.rva .LSEH_info_gcm_ghash_clmul
.section .xdata
.p2align 3
.LSEH_info_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lgmult_prologue,.Lgmult_epilogue
+.rva .Lgmult_prologue,.Lgmult_epilogue
.LSEH_info_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lghash_prologue,.Lghash_epilogue
+.rva .Lghash_prologue,.Lghash_epilogue
.LSEH_info_gcm_init_clmul:
.byte 0x01,0x08,0x03,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0x22,0x00,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x22,0x00,0x00
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x33,0x16,0x00
-.byte 0x33,0xf8,0x09,0x00
-.byte 0x2e,0xe8,0x08,0x00
-.byte 0x29,0xd8,0x07,0x00
-.byte 0x24,0xc8,0x06,0x00
-.byte 0x1f,0xb8,0x05,0x00
-.byte 0x1a,0xa8,0x04,0x00
-.byte 0x15,0x98,0x03,0x00
-.byte 0x10,0x88,0x02,0x00
-.byte 0x0c,0x78,0x01,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0x01,0x15,0x00
+.byte 0x33,0xf8,0x09,0x00
+.byte 0x2e,0xe8,0x08,0x00
+.byte 0x29,0xd8,0x07,0x00
+.byte 0x24,0xc8,0x06,0x00
+.byte 0x1f,0xb8,0x05,0x00
+.byte 0x1a,0xa8,0x04,0x00
+.byte 0x15,0x98,0x03,0x00
+.byte 0x10,0x88,0x02,0x00
+.byte 0x0c,0x78,0x01,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x01,0x15,0x00
ret
.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
-
.section .note.GNU-stack,"",%progbits
-
-
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp .Loop_schedule_128
.align 16
.Lschedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
.align 16
.Lschedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 64
.size _vpaes_consts,.-_vpaes_consts
-
.section .note.GNU-stack,"",%progbits
-
-
--- /dev/null
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the Andy Polyakov nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+.section .note.GNU-stack,"",%progbits
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_encrypt,.-.L_aesni_encrypt_begin
.globl aesni_decrypt
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_decrypt,.-.L_aesni_decrypt_begin
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.size _aesni_decrypt2,.-_aesni_decrypt2
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz .L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.align 16
_aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz .L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
_aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz .L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
_aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz .L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.align 16
_aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
-.byte 102,15,56,220,241
- movups (%edx),%xmm0
-.byte 102,15,56,220,249
- jmp .L_aesni_encrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L006enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 16
.L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz .L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.align 16
_aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
-.byte 102,15,56,222,241
- movups (%edx),%xmm0
-.byte 102,15,56,222,249
- jmp .L_aesni_decrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L007dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 16
.L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz .L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L008ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L009ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L010ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L011ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call _aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L012ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L010ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L013ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L014ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L015ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L016ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L017enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+.L018ecb_enc_two:
+ call _aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L018ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L019ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call _aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L020ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L018ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L021ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L022ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L023ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L024ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L025dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+.L026ecb_dec_two:
+ call _aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-.L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz .L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L028enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp .L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L033ccm64_dec_outer
.align 16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L030ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-.L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp .L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp .L033ccm64_dec_outer
.align 16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L032enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L033ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L034ctr32_tail
+ jb .L038ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+ jmp .L039ctr32_loop6
+.align 16
+.L039ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movups (%esi),%xmm1
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L035ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L036ctr32_ret
+ jz .L040ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-.L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb .L041ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je .L042ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb .L043ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L040ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call _aesni_encrypt6
movups (%esi),%xmm1
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L041enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L038ctr32_two:
- call _aesni_encrypt3
+.L042ctr32_two:
+ call _aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L039ctr32_three:
+.L043ctr32_three:
call _aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_four:
+.L044ctr32_four:
call _aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L042enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L044xts_enc_loop6
+ jc .L047xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L048xts_enc_loop6
.align 16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L048xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L045xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L046xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L047xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L048xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L049xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L046xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L051enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L053xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L050xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L053xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L054enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L055enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L057xts_dec_loop6
+ jc .L060xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L061xts_dec_loop6
.align 16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call .L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L061xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L058xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L059xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L060xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L061xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L062xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L059xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L064dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L066xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L063xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L067dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L068xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L070cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L071cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L072cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L074enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L074enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L073cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L072cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L075cbc_ret
-.L072cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L071cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L076cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L077cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L078cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L078cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L079cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L076cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L080cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L081cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L082cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L083cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L080cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L084dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L084dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+.L085cbc_dec_two:
+ call _aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_three:
+.L086cbc_dec_three:
call _aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_four:
+.L087cbc_dec_four:
call _aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L079cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L085cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L075cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L085cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L075cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L070cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
.type _aesni_set_encrypt_key,@function
.align 16
_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L086bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L086bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal _gnutls_x86_cpuid_s,%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08714rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L08812rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L089bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09010rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L091key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L092key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L092key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L091key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm1,%xmm0
ret
.align 16
-.L08812rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L093key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L094key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L095key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L093key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L096key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
pxor %xmm3,%xmm2
ret
.align 16
-.L094key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L096key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08714rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L097key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L099key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L099key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L097key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm1,%xmm0
ret
.align 16
-.L098key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L086bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L089bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
.globl aesni_set_encrypt_key
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L100dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L101dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L101dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L100dec_key_ret:
+.L114dec_key_ret:
ret
.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
-
+.comm _gnutls_x86_cpuid_s,16,4
.section .note.GNU-stack,"",%progbits
-
-
# *** This file is auto-generated ***
#
.text
+
.globl aesni_encrypt
.type aesni_encrypt,@function
.align 16
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz .Loop_enc1_1
+ jnz .Loop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz .Loop_dec1_2
+ jnz .Loop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+.size _aesni_decrypt2,.-_aesni_decrypt2
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
.align 16
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
.align 16
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
.align 16
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
.align 16
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
- movups (%rcx),%xmm0
-.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
+.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lenc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
.align 16
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
- movups (%rcx),%xmm0
-.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
+.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Ldec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
.align 16
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
- jmp .Lenc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
.align 16
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
- jmp .Ldec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
+.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_3
+ jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_4
+ jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
.align 16
aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa .Lincrement64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
.align 16
aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_5
+ jnz .Loop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
jz .Lccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp .Lccm64_dec2_loop
+.align 16
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz .Loop_enc1_6
+ jnz .Loop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je .Lctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
movdqa %xmm2,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
movdqa %xmm2,112(%rsp)
- movl 240(%rcx),%eax
-
- leaq 1(%r8),%r9
- leaq 2(%r8),%r10
- bswapl %r9d
- bswapl %r10d
- xorl %r11d,%r9d
- xorl %r11d,%r10d
-.byte 102,65,15,58,34,217,3
- leaq 3(%r8),%r9
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,65,15,58,34,226,3
- bswapl %r9d
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%r9d
+ xorl %r11d,%eax
bswapl %r10d
-.byte 102,65,15,58,34,233,3
+.byte 102,15,58,34,232,3
xorl %r11d,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
xorl %r11d,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
+ movl _gnutls_x86_cpuid_s+4(%rip),%r10d
xorl %r11d,%r9d
+ andl $71303168,%r10d
movl %r9d,112+12(%rsp)
movups 16(%rcx),%xmm1
cmpq $8,%rdx
jb .Lctr32_tail
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
leaq 128(%rcx),%rcx
- subq $8,%rdx
+ subq $2,%rdx
jmp .Lctr32_loop8
+.align 16
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call .Lenc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
.align 32
.Lctr32_loop8:
addl $8,%r8d
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 48-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 96-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 112-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 128-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 144-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
.byte 102,15,56,220,224
xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
- movdqu 0(%rdi),%xmm10
.byte 102,68,15,56,220,200
movups 160-128(%rcx),%xmm0
- cmpl $11,%eax
jb .Lctr32_enc_done
.byte 102,15,56,220,209
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+.align 16
.Lctr32_enc_done:
movdqu 16(%rdi),%xmm11
pxor %xmm0,%xmm10
pxor %xmm0,%xmm13
movdqu 80(%rdi),%xmm15
pxor %xmm0,%xmm14
-.byte 102,15,56,220,209
pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
.byte 102,65,15,56,221,210
pxor %xmm0,%xmm1
- movdqu 112(%rdi),%xmm10
- leaq 128(%rdi),%rdi
+ movdqu 112-128(%rdi),%xmm10
.byte 102,65,15,56,221,219
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
.byte 102,65,15,56,221,228
- movdqa 16(%rsp),%xmm12
.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
.byte 102,65,15,56,221,246
- movdqa 48(%rsp),%xmm14
.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
.byte 102,68,15,56,221,193
movdqa 80(%rsp),%xmm0
-.byte 102,69,15,56,221,202
movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
leaq -128(%rcx),%rcx
.Lctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
+
+ shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
.byte 102,15,56,220,217
- shrl $1,%eax
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,225
- decl %eax
-.byte 102,15,56,220,233
+ addq $16,%rax
movups (%rdi),%xmm10
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
movups 16(%rdi),%xmm11
-.byte 102,15,56,220,249
movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
- movups 16(%rcx),%xmm1
call .Lenc_loop8_enter
.Lctr32_loop4:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop4
.byte 102,15,56,221,209
- movups (%rdi),%xmm10
.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
.byte 102,15,56,221,225
- movups 32(%rdi),%xmm12
.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
xorps %xmm10,%xmm2
.Lctr32_loop3:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop3
.byte 102,15,56,221,209
.byte 102,15,56,221,217
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-.Loop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp .Lctr32_done
-.align 16
.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
aesni_xts_encrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.align 16
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_enc_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm11
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm13
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_9
+ jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_10
+ jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
aesni_xts_decrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.align 32
+.Lxts_dec_loop6:
+.byte 102,15,56,222,209
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqa %xmm12,32(%rsp)
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
- movdqa %xmm14,64(%rsp)
.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
-.align 16
-.Lxts_dec_loop6:
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -80(%rcx,%rax,1),%xmm0
+ jnz .Lxts_dec_loop6
+
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- decl %eax
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
+
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
- jnz .Lxts_dec_loop6
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
.byte 102,15,56,222,241
.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
- paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
-
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_dec_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm12
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm14
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_12
+ jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
.align 16
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_13
+ jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_14
+ jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_15
+ jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp .Lcbc_enc_loop
+ jmp .Lcbc_enc_loop
.align 16
.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
movdqa %xmm5,%xmm14
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
- cmpq $112,%rdx
+ movl _gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
- subq $112,%rdx
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- setnc %r11b
.byte 102,68,15,56,222,193
+ setnc %r11b
shlq $7,%r11
.byte 102,68,15,56,222,201
addq %rdi,%r11
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 64-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 80-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 96-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 112-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 128-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 160-112(%rcx),%xmm0
- cmpl $11,%eax
jb .Lcbc_dec_done
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 176-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 208-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.align 16
.Lcbc_dec_done:
.byte 102,15,56,222,209
- pxor %xmm0,%xmm10
.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
.byte 102,15,56,222,225
- pxor %xmm0,%xmm12
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
.byte 102,15,56,222,241
- pxor %xmm0,%xmm14
.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
.byte 102,65,15,56,223,219
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
- leaq 128(%rdi),%rdi
.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
movdqu 0(%r11),%xmm11
.byte 102,65,15,56,223,237
- movdqu 16(%r11),%xmm12
.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
movdqu 32(%r11),%xmm13
.byte 102,65,15,56,223,255
- movdqu 48(%r11),%xmm14
.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
movdqu 64(%r11),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
- jle .Lcbc_dec_tail_collected
+ addq $0x70,%rdx
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.align 16
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
- subq $16,%rdx
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
.align 16
.Lcbc_dec_two:
movaps %xmm3,%xmm12
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor %xmm10,%xmm2
movaps %xmm12,%xmm10
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
.type aesni_set_decrypt_key,@function
.align 16
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
.align 16
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz .Lenc_key_ret
testq %rdx,%rdx
jz .Lenc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl _gnutls_x86_cpuid_s+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
.L10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
xorl %eax,%eax
jmp .Lenc_key_ret
+.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
xorq %rax,%rax
jmp .Lenc_key_ret
+.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
xorq %rax,%rax
jmp .Lenc_key_ret
+.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
-
.section .note.GNU-stack,"",%progbits
-
-
.size gnutls_have_cpuid,.-.L_gnutls_have_cpuid_begin
.byte 67,80,85,73,68,32,102,111,114,32,120,56,54,0
-
.section .note.GNU-stack,"",%progbits
-
-
#
.text
+
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,@function
.align 16
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.align 16
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
.align 16
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
.align 32
gcm_ghash_clmul:
.L_ghash_clmul:
- movdqa .Lbswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
- movdqu 16(%rsi),%xmm9
- cmpq $48,%rcx
+ movdqu 16(%rsi),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%rcx
jb .Lskip4x
- subq $48,%rcx
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
- movdqu 48(%rdx),%xmm6
+ movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%rdx),%xmm11
- movdqu 0(%rdx),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc .Ltail4x
jmp .Lmod4_loop
.align 32
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%rdx),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rsi),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa .L7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%rdx),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz .Ldone
- movdqu 32(%rsi),%xmm10
- subq $16,%rcx
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
jz .Lodd_tail
.Lskip4x:
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%rdx),%rdx
- subq $32,%rcx
+ nop
+ subq $0x20,%rcx
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
-
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%rdx),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%rdx),%xmm6
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%rdx),%rdx
- subq $32,%rcx
+ subq $0x20,%rcx
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
jnz .Ldone
.Lodd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.size gcm_ghash_clmul,.-gcm_ghash_clmul
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
- jmp .L_init_clmul
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
.type gcm_gmult_avx,@function
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
- jmp .L_ghash_clmul
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
-
.section .note.GNU-stack,"",%progbits
-
-
X86_FILES_ELF=elf/aesni-x86.s elf/cpuid-x86.s elf/sha1-ssse3-x86.s elf/sha256-ssse3-x86.s elf/sha512-ssse3-x86.s elf/aes-ssse3-x86.s
X86_FILES_COFF=coff/aesni-x86.s coff/cpuid-x86.s coff/sha1-ssse3-x86.s coff/sha256-ssse3-x86.s coff/sha512-ssse3-x86.s coff/aes-ssse3-x86.s
X86_FILES_MACOSX=macosx/aesni-x86.s macosx/cpuid-x86.s macosx/sha1-ssse3-x86.s macosx/sha256-ssse3-x86.s macosx/sha512-ssse3-x86.s macosx/aes-ssse3-x86.s
-X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s
-X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s
-X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s
+X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s elf/aesni-gcm-x86_64.s
+X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s coff/aesni-gcm-x86_64.s
+X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s macosx/aesni-gcm-x86_64.s
X86_PADLOCK_FILES_ELF=elf/e_padlock-x86.s
X86_PADLOCK_FILES_COFF=coff/e_padlock-x86.s
X86_PADLOCK_FILES_MACOSX=macosx/e_padlock-x86.s
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa L$k_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq L$k_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa L$k_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa L$k_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
L$schedule_go:
cmpl $192,%esi
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp L$oop_schedule_128
.p2align 4
L$schedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
L$oop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp L$oop_schedule_192
.p2align 4
L$schedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
L$oop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
L$schedule_mangle_last_dec:
addq $-16,%rdx
pxor L$k_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
L$k_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.p2align 6
--- /dev/null
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the Andy Polyakov nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+
+.p2align 5
+_aesni_ctr32_ghash_6x:
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp L$oop6x
+
+.p2align 5
+L$oop6x:
+ addl $100663296,%ebx
+ jc L$handle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+L$resume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb L$enc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je L$enc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp L$enc_tail
+
+.p2align 5
+L$handle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp L$resume_ctr32
+
+.p2align 5
+L$enc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc L$6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp L$oop6x
+
+L$6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+
+.globl _aesni_gcm_decrypt
+
+.p2align 5
+_aesni_gcm_decrypt:
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb L$gcm_dec_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq L$bswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc L$dec_no_key_aliasing
+ cmpq $768,%r15
+ jnc L$dec_no_key_aliasing
+ subq %r15,%rsp
+L$dec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$gcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc L$handle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp L$oop_ctr32
+
+.p2align 4
+L$oop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz L$oop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.p2align 5
+L$handle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp L$oop_ctr32
+
+
+.globl _aesni_gcm_encrypt
+
+.p2align 5
+_aesni_gcm_encrypt:
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb L$gcm_enc_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq L$bswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc L$enc_no_key_aliasing
+ cmpq $768,%r15
+ jnc L$enc_no_key_aliasing
+ subq %r15,%rsp
+L$enc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$gcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$poly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$one_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$two_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+L$one_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+
leal 16(%edx),%edx
jnz L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.align 4
leal 16(%edx),%edx
jnz L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.align 4
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.align 4
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
ret
.align 4
__aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.align 4
__aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
__aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
__aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.align 4
__aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
-.byte 102,15,56,220,241
- movups (%edx),%xmm0
-.byte 102,15,56,220,249
- jmp L_aesni_encrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp L008_aesni_encrypt6_inner
.align 4,0x90
-L006enc6_loop:
+L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
+L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 4,0x90
L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.align 4
__aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
-.byte 102,15,56,222,241
- movups (%edx),%xmm0
-.byte 102,15,56,222,249
- jmp L_aesni_decrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp L010_aesni_decrypt6_inner
.align 4,0x90
-L007dec6_loop:
+L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
+L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 4,0x90
L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz L008ecb_ret
+ jz L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz L009ecb_decrypt
+ jz L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L010ecb_enc_tail
+ jb L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L011ecb_enc_loop6_enter
+ jmp L015ecb_enc_loop6_enter
.align 4,0x90
-L012ecb_enc_loop6:
+L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L011ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L012ecb_enc_loop6
+ jnc L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L008ecb_ret
-L010ecb_enc_tail:
+ jz L012ecb_ret
+L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L013ecb_enc_one
+ jb L017ecb_enc_one
movups 16(%esi),%xmm3
- je L014ecb_enc_two
+ je L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L015ecb_enc_three
+ jb L019ecb_enc_three
movups 48(%esi),%xmm5
- je L016ecb_enc_four
+ je L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L013ecb_enc_one:
+L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L017enc1_loop_3:
+L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L017enc1_loop_3
+ jnz L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+L018ecb_enc_two:
+ call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L015ecb_enc_three:
+L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L016ecb_enc_four:
+L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L009ecb_decrypt:
+L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L018ecb_dec_tail
+ jb L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L019ecb_dec_loop6_enter
+ jmp L023ecb_dec_loop6_enter
.align 4,0x90
-L020ecb_dec_loop6:
+L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L019ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L020ecb_dec_loop6
+ jnc L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L008ecb_ret
-L018ecb_dec_tail:
+ jz L012ecb_ret
+L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L021ecb_dec_one
+ jb L025ecb_dec_one
movups 16(%esi),%xmm3
- je L022ecb_dec_two
+ je L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L023ecb_dec_three
+ jb L027ecb_dec_three
movups 48(%esi),%xmm5
- je L024ecb_dec_four
+ je L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L021ecb_dec_one:
+L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L025dec1_loop_4:
+L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L025dec1_loop_4
+ jnz L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+L026ecb_dec_two:
+ call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L023ecb_dec_three:
+L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L024ecb_dec_four:
+L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L008ecb_ret:
+L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-L026ccm64_enc_outer:
+L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+L031ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L028enc1_loop_5:
+L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L028enc1_loop_5
+ jnz L032enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L029ccm64_dec_outer:
+L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz L030ccm64_dec_break
+ jz L034ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+L035ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L030ccm64_dec_break:
+L034ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-L032enc1_loop_6:
+L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L032enc1_loop_6
+ jnz L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je L033ctr32_one_shortcut
+ je L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb L034ctr32_tail
+ jb L038ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp L035ctr32_loop6
+ jmp L039ctr32_loop6
.align 4,0x90
-L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+L039ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call L_aesni_encrypt6_enter
movups (%esi),%xmm1
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc L035ctr32_loop6
+ jnc L039ctr32_loop6
addl $6,%eax
- jz L036ctr32_ret
+ jz L040ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb L041ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je L042ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb L043ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je L040ctr32_four
+ je L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L033ctr32_one_shortcut:
+L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-L037ctr32_one:
+L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L041enc1_loop_7:
+L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L041enc1_loop_7
+ jnz L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L038ctr32_two:
- call __aesni_encrypt3
+L042ctr32_two:
+ call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L039ctr32_three:
+L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L040ctr32_four:
+L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L036ctr32_ret:
+L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L042enc1_loop_8:
+L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L042enc1_loop_8
+ jnz L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp L044xts_enc_loop6
+ jc L047xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L048xts_enc_loop6
.align 4,0x90
-L044xts_enc_loop6:
+L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc L048xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L043xts_enc_short:
+L047xts_enc_short:
addl $96,%eax
- jz L045xts_enc_done6x
+ jz L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L046xts_enc_one
+ jb L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L047xts_enc_two
+ je L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L048xts_enc_three
+ jb L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L049xts_enc_four
+ je L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L046xts_enc_one:
+L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L051enc1_loop_9:
+L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L051enc1_loop_9
+ jnz L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L047xts_enc_two:
+L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+ call __aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L048xts_enc_three:
+L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L049xts_enc_four:
+L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L045xts_enc_done6x:
+L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L052xts_enc_ret
+ jz L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp L053xts_enc_steal
+ jmp L057xts_enc_steal
.align 4,0x90
-L050xts_enc_done:
+L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L052xts_enc_ret
+ jz L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-L053xts_enc_steal:
+L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L053xts_enc_steal
+ jnz L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L054enc1_loop_10:
+L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L054enc1_loop_10
+ jnz L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-L052xts_enc_ret:
+L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L055enc1_loop_11:
+L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L055enc1_loop_11
+ jnz L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp L057xts_dec_loop6
+ jc L060xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L061xts_dec_loop6
.align 4,0x90
-L057xts_dec_loop6:
+L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc L061xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L056xts_dec_short:
+L060xts_dec_short:
addl $96,%eax
- jz L058xts_dec_done6x
+ jz L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L059xts_dec_one
+ jb L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L060xts_dec_two
+ je L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L061xts_dec_three
+ jb L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L062xts_dec_four
+ je L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L059xts_dec_one:
+L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L064dec1_loop_12:
+L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L064dec1_loop_12
+ jnz L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L060xts_dec_two:
+L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call __aesni_decrypt3
+ call __aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L061xts_dec_three:
+L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L062xts_dec_four:
+L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L058xts_dec_done6x:
+L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L065xts_dec_ret
+ jz L069xts_dec_ret
movl %eax,112(%esp)
- jmp L066xts_dec_only_one_more
+ jmp L070xts_dec_only_one_more
.align 4,0x90
-L063xts_dec_done:
+L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L065xts_dec_ret
+ jz L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-L066xts_dec_only_one_more:
+L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L067dec1_loop_13:
+L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L067dec1_loop_13
+ jnz L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-L068xts_dec_steal:
+L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L068xts_dec_steal
+ jnz L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L069dec1_loop_14:
+L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L069dec1_loop_14
+ jnz L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-L065xts_dec_ret:
+L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz L070cbc_abort
+ jz L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je L071cbc_decrypt
+ je L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb L072cbc_enc_tail
+ jb L076cbc_enc_tail
subl $16,%eax
- jmp L073cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L073cbc_enc_loop:
+L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-L074enc1_loop_15:
+L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L074enc1_loop_15
+ jnz L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc L073cbc_enc_loop
+ jnc L077cbc_enc_loop
addl $16,%eax
- jnz L072cbc_enc_tail
+ jnz L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp L075cbc_ret
-L072cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp L079cbc_ret
+L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp L073cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L071cbc_decrypt:
+L075cbc_decrypt:
cmpl $80,%eax
- jbe L076cbc_dec_tail
+ jbe L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp L077cbc_dec_loop6_enter
+ jmp L081cbc_dec_loop6_enter
.align 4,0x90
-L078cbc_dec_loop6:
+L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-L077cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja L078cbc_dec_loop6
+ ja L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle L079cbc_dec_tail_collected
+ jle L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-L076cbc_dec_tail:
+L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe L080cbc_dec_one
+ jbe L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe L081cbc_dec_two
+ jbe L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe L082cbc_dec_three
+ jbe L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe L083cbc_dec_four
+ jbe L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L080cbc_dec_one:
+L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L084dec1_loop_16:
+L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L084dec1_loop_16
+ jnz L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+L085cbc_dec_two:
+ call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L082cbc_dec_three:
+L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L083cbc_dec_four:
+L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-L079cbc_dec_tail_collected:
+ jmp L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+L088cbc_dec_tail_collected:
andl $15,%eax
- jnz L085cbc_dec_tail_partial
+ jnz L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp L075cbc_ret
+ pxor %xmm0,%xmm0
+ jmp L079cbc_ret
.align 4,0x90
-L085cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-L075cbc_ret:
+ movdqa %xmm2,(%esp)
+L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-L070cbc_abort:
+ pxor %xmm7,%xmm7
+L074cbc_abort:
popl %edi
popl %esi
popl %ebx
ret
.align 4
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz L086bad_pointer
+ jz L091bad_pointer
testl %edx,%edx
- jz L086bad_pointer
+ jz L091bad_pointer
+ call L092pic
+L092pic:
+ popl %ebx
+ leal Lkey_const-L092pic(%ebx),%ebx
+ movl L__gnutls_x86_cpuid_s$non_lazy_ptr-Lkey_const(%ebx),%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je L08714rounds
+ je L09314rounds
cmpl $192,%ecx
- je L08812rounds
+ je L09412rounds
cmpl $128,%ecx
- jne L089bad_keybits
+ jne L095bad_keybits
.align 4,0x90
-L09010rounds:
+L09610rounds:
+ cmpl $268435456,%ebp
+ je L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call L091key_128_cold
+ call L098key_128_cold
.byte 102,15,58,223,200,2
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,4
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,8
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,16
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,32
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,64
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,128
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,27
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,54
- call L092key_128
+ call L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L092key_128:
+L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-L091key_128_cold:
+L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L08812rounds:
+L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call L093key_192a_cold
+ call L103key_192a_cold
.byte 102,15,58,223,202,2
- call L094key_192b
+ call L104key_192b
.byte 102,15,58,223,202,4
- call L095key_192a
+ call L105key_192a
.byte 102,15,58,223,202,8
- call L094key_192b
+ call L104key_192b
.byte 102,15,58,223,202,16
- call L095key_192a
+ call L105key_192a
.byte 102,15,58,223,202,32
- call L094key_192b
+ call L104key_192b
.byte 102,15,58,223,202,64
- call L095key_192a
+ call L105key_192a
.byte 102,15,58,223,202,128
- call L094key_192b
+ call L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L095key_192a:
+L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 4,0x90
-L093key_192a_cold:
+L103key_192a_cold:
movaps %xmm2,%xmm5
-L096key_192b_warm:
+L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
pxor %xmm3,%xmm2
ret
.align 4,0x90
-L094key_192b:
+L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp L096key_192b_warm
+ jmp L106key_192b_warm
.align 4,0x90
-L08714rounds:
+L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call L097key_256a_cold
+ call L109key_256a_cold
.byte 102,15,58,223,200,1
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,2
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,2
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,4
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,4
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,8
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,8
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,16
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,16
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,32
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,32
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,64
- call L099key_256a
+ call L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L099key_256a:
+L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-L097key_256a_cold:
+L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L098key_256b:
+L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 4,0x90
+L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp L112loop_key256
+L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 2,0x90
-L086bad_pointer:
+L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 2,0x90
-L089bad_keybits:
+L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.align 4
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz L100dec_key_ret
+ jnz L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L101dec_key_inverse:
+L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L101dec_key_inverse
+ ja L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-L100dec_key_ret:
+L114dec_key_ret:
ret
+.align 6,0x90
+Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L__gnutls_x86_cpuid_s$non_lazy_ptr:
+.indirect_symbol __gnutls_x86_cpuid_s
+.long 0
+.comm __gnutls_x86_cpuid_s,16,2
# *** This file is auto-generated ***
#
.text
+
.globl _aesni_encrypt
.p2align 4
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz L$oop_enc1_1
+ jnz L$oop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz L$oop_dec1_2
+ jnz L$oop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
.byte 0xf3,0xc3
.p2align 4
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
L$enc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop3
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
L$dec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop3
.byte 102,15,56,222,209
.p2align 4
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
L$enc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop4
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
L$dec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop4
.byte 102,15,56,222,209
.p2align 4
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
- movups (%rcx),%xmm0
-.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
+L$enc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop6
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
- movups (%rcx),%xmm0
-.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
+L$dec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop6
.byte 102,15,56,222,209
.p2align 4
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
- jmp L$enc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+L$enc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
L$enc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop8
.byte 102,15,56,220,209
.p2align 4
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
- jmp L$dec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
+L$dec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
L$dec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop8
.byte 102,15,56,222,209
testl %r8d,%r8d
jz L$ecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb L$ecb_enc_tail
movdqu (%rdi),%xmm2
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp L$ecb_enc_loop8_enter
.p2align 4
L$ecb_enc_loop8:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc L$ecb_enc_loop8
movups %xmm2,(%rsi)
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz L$ecb_ret
L$ecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$ecb_enc_one
movups 16(%rdi),%xmm3
je L$ecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$ecb_enc_three
movups 48(%rdi),%xmm5
je L$ecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb L$ecb_enc_five
movups 80(%rdi),%xmm7
je L$ecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_3
+ jnz L$oop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp L$ecb_ret
.p2align 4
L$ecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp L$ecb_ret
.p2align 4
L$ecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb L$ecb_dec_tail
movdqu (%rdi),%xmm2
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp L$ecb_dec_loop8_enter
.p2align 4
L$ecb_dec_loop8:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz L$ecb_ret
L$ecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$ecb_dec_one
movups 16(%rdi),%xmm3
je L$ecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$ecb_dec_three
movups 48(%rdi),%xmm5
je L$ecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb L$ecb_dec_five
movups 80(%rdi),%xmm7
je L$ecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$ecb_ret
.p2align 4
L$ecb_dec_one:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_4
+ jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp L$ecb_ret
.p2align 4
L$ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp L$ecb_ret
.p2align 4
L$ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp L$ecb_ret
.p2align 4
L$ecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp L$ecb_ret
.p2align 4
L$ecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
L$ecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.globl _aesni_ccm64_encrypt_blocks
.p2align 4
_aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa L$increment64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa L$increment64(%rip),%xmm9
movdqa L$bswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp L$ccm64_enc_outer
.p2align 4
L$ccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
L$ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ccm64_decrypt_blocks
.p2align 4
_aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa L$increment64(%rip),%xmm6
+ movdqa L$increment64(%rip),%xmm9
movdqa L$bswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_5
+ jnz L$oop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp L$ccm64_dec_outer
.p2align 4
L$ccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
jz L$ccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp L$ccm64_dec2_loop
+.p2align 4
L$ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$ccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp L$ccm64_dec_outer
.p2align 4
L$ccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz L$oop_enc1_6
+ jnz L$oop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ctr32_encrypt_blocks
.p2align 4
_aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne L$ctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp L$ctr32_epilogue
+
+.p2align 4
+L$ctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je L$ctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
movdqa %xmm2,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
movdqa %xmm2,112(%rsp)
- movl 240(%rcx),%eax
-
- leaq 1(%r8),%r9
- leaq 2(%r8),%r10
- bswapl %r9d
- bswapl %r10d
- xorl %r11d,%r9d
- xorl %r11d,%r10d
-.byte 102,65,15,58,34,217,3
- leaq 3(%r8),%r9
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,65,15,58,34,226,3
- bswapl %r9d
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%r9d
+ xorl %r11d,%eax
bswapl %r10d
-.byte 102,65,15,58,34,233,3
+.byte 102,15,58,34,232,3
xorl %r11d,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
xorl %r11d,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
+ movl __gnutls_x86_cpuid_s+4(%rip),%r10d
xorl %r11d,%r9d
+ andl $71303168,%r10d
movl %r9d,112+12(%rsp)
movups 16(%rcx),%xmm1
cmpq $8,%rdx
jb L$ctr32_tail
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je L$ctr32_6x
+
leaq 128(%rcx),%rcx
- subq $8,%rdx
+ subq $2,%rdx
jmp L$ctr32_loop8
+.p2align 4
+L$ctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp L$ctr32_loop6
+
+.p2align 4
+L$ctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call L$enc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc L$ctr32_loop6
+
+ addq $6,%rdx
+ jz L$ctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp L$ctr32_tail
+
.p2align 5
L$ctr32_loop8:
addl $8,%r8d
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 48-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 96-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 112-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 128-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 144-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
.byte 102,15,56,220,224
xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
- movdqu 0(%rdi),%xmm10
.byte 102,68,15,56,220,200
movups 160-128(%rcx),%xmm0
- cmpl $11,%eax
jb L$ctr32_enc_done
.byte 102,15,56,220,209
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 224-128(%rcx),%xmm0
+ jmp L$ctr32_enc_done
+.p2align 4
L$ctr32_enc_done:
movdqu 16(%rdi),%xmm11
pxor %xmm0,%xmm10
pxor %xmm0,%xmm13
movdqu 80(%rdi),%xmm15
pxor %xmm0,%xmm14
-.byte 102,15,56,220,209
pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
.byte 102,65,15,56,221,210
pxor %xmm0,%xmm1
- movdqu 112(%rdi),%xmm10
- leaq 128(%rdi),%rdi
+ movdqu 112-128(%rdi),%xmm10
.byte 102,65,15,56,221,219
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
.byte 102,65,15,56,221,228
- movdqa 16(%rsp),%xmm12
.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
.byte 102,65,15,56,221,246
- movdqa 48(%rsp),%xmm14
.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
.byte 102,68,15,56,221,193
movdqa 80(%rsp),%xmm0
-.byte 102,69,15,56,221,202
movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
leaq -128(%rcx),%rcx
L$ctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb L$ctr32_loop3
je L$ctr32_loop4
+
+ shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
.byte 102,15,56,220,217
- shrl $1,%eax
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,225
- decl %eax
-.byte 102,15,56,220,233
+ addq $16,%rax
movups (%rdi),%xmm10
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
movups 16(%rdi),%xmm11
-.byte 102,15,56,220,249
movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
- movups 16(%rcx),%xmm1
call L$enc_loop8_enter
L$ctr32_loop4:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups (%rcx),%xmm1
- decl %eax
jnz L$ctr32_loop4
.byte 102,15,56,221,209
- movups (%rdi),%xmm10
.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
.byte 102,15,56,221,225
- movups 32(%rdi),%xmm12
.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
xorps %xmm10,%xmm2
L$ctr32_loop3:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
movups (%rcx),%xmm1
- decl %eax
jnz L$ctr32_loop3
.byte 102,15,56,221,209
.byte 102,15,56,221,217
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp L$ctr32_done
-.p2align 4
-L$ctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
L$ctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$ctr32_epilogue:
_aesni_xts_encrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
L$oop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz L$oop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz L$oop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_enc_grandloop
-.p2align 4
+.p2align 5
L$xts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp L$xts_enc_loop6
+.p2align 5
L$xts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$xts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz L$xts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
L$xts_enc_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz L$xts_enc_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm11
+ cmpq $0x20,%rdx
jb L$xts_enc_one
+ pxor %xmm0,%xmm12
je L$xts_enc_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm13
+ cmpq $0x40,%rdx
jb L$xts_enc_three
+ pxor %xmm0,%xmm14
je L$xts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_9
+ jnz L$oop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_enc_done
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_10
+ jnz L$oop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_enc_epilogue:
_aesni_xts_decrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
L$oop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz L$oop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz L$oop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_dec_grandloop
-.p2align 4
+.p2align 5
L$xts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
- pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
- pxor %xmm13,%xmm5
- movups (%r11),%xmm0
- pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
-
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
- movdqa %xmm10,0(%rsp)
.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
- movdqa %xmm11,16(%rsp)
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqa %xmm12,32(%rsp)
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
- movdqa %xmm14,64(%rsp)
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
+
+ pxor %xmm9,%xmm10
.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
+ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_dec_loop6_enter
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
-.p2align 4
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp L$xts_dec_loop6
+.p2align 5
L$xts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$xts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz L$xts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
L$xts_dec_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz L$xts_dec_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm12
+ cmpq $0x20,%rdx
jb L$xts_dec_one
+ pxor %xmm0,%xmm13
je L$xts_dec_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm14
+ cmpq $0x40,%rdx
jb L$xts_dec_three
je L$xts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz L$xts_dec_ret
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_12
+ jnz L$oop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
.p2align 4
L$xts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_dec_done
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_13
+ jnz L$oop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_14
+ jnz L$oop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
L$xts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_dec_epilogue:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_15
+ jnz L$oop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
jnc L$cbc_enc_loop
addq $16,%rdx
jnz L$cbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp L$cbc_ret
L$cbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp L$cbc_enc_loop
+ jmp L$cbc_enc_loop
.p2align 4
L$cbc_decrypt:
+ cmpq $16,%rdx
+ jne L$cbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$cbc_ret
+.p2align 4
+L$cbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe L$cbc_dec_tail
movups (%rcx),%xmm0
movdqa %xmm5,%xmm14
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
- cmpq $112,%rdx
+ movl __gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $0x70,%rdx
jbe L$cbc_dec_six_or_seven
- subq $112,%rdx
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je L$cbc_dec_loop6_enter
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp L$cbc_dec_loop8_enter
.p2align 4
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- setnc %r11b
.byte 102,68,15,56,222,193
+ setnc %r11b
shlq $7,%r11
.byte 102,68,15,56,222,201
addq %rdi,%r11
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 64-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 80-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 96-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 112-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 128-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 160-112(%rcx),%xmm0
- cmpl $11,%eax
jb L$cbc_dec_done
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 176-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 208-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 224-112(%rcx),%xmm0
+ jmp L$cbc_dec_done
+.p2align 4
L$cbc_dec_done:
.byte 102,15,56,222,209
- pxor %xmm0,%xmm10
.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
.byte 102,15,56,222,225
- pxor %xmm0,%xmm12
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
.byte 102,15,56,222,241
- pxor %xmm0,%xmm14
.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
.byte 102,65,15,56,223,219
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
- leaq 128(%rdi),%rdi
.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
movdqu 0(%r11),%xmm11
.byte 102,65,15,56,223,237
- movdqu 16(%r11),%xmm12
.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
movdqu 32(%r11),%xmm13
.byte 102,65,15,56,223,255
- movdqu 48(%r11),%xmm14
.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
movdqu 64(%r11),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja L$cbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
- jle L$cbc_dec_tail_collected
+ addq $0x70,%rdx
+ jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe L$cbc_dec_tail
movaps %xmm11,%xmm2
L$cbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja L$cbc_dec_seven
movaps %xmm7,%xmm8
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp L$cbc_dec_tail_collected
.p2align 4
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+L$cbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja L$cbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle L$cbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
L$cbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_four
movups 64(%rdi),%xmm6
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
- subq $16,%rdx
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-L$oop_dec1_16:
+L$oop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_16
+ jnz L$oop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
.p2align 4
L$cbc_dec_two:
movaps %xmm3,%xmm12
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor %xmm10,%xmm2
movaps %xmm12,%xmm10
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
+L$cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
L$cbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
L$cbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
L$cbc_ret:
.p2align 4
_aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
.p2align 4
_aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz L$enc_key_ret
testq %rdx,%rdx
jz L$enc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl __gnutls_x86_cpuid_s+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
L$10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je L$10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call L$key_expansion_128_cold
xorl %eax,%eax
jmp L$enc_key_ret
+.p2align 4
+L$10rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa L$key_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key128
+
+.p2align 4
+L$oop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz L$oop_key128
+
+ movdqa L$key_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je L$12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call L$key_expansion_192a_cold
xorq %rax,%rax
jmp L$enc_key_ret
+.p2align 4
+L$12rounds_alt:
+ movdqa L$key_rotate192(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key192
+
+.p2align 4
+L$oop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz L$oop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je L$14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
xorq %rax,%rax
jmp L$enc_key_ret
+.p2align 4
+L$14rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp L$oop_key256
+
+.p2align 4
+L$oop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz L$done_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp L$oop_key256
+
+L$done_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
L$SEH_end_set_encrypt_key:
.long 0x87,0,1,0
L$increment1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long 1,1,1,1
+L$key_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
#
.text
+
.globl _gcm_gmult_4bit
.p2align 4
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp L$oop1
.p2align 4
L$oop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
js L$break1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
.p2align 4
L$break1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
.p2align 5
_gcm_ghash_clmul:
L$_ghash_clmul:
- movdqa L$bswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+ movdqa L$bswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz L$odd_tail
- movdqu 16(%rsi),%xmm9
- cmpq $48,%rcx
+ movdqu 16(%rsi),%xmm6
+ movl __gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%rcx
jb L$skip4x
- subq $48,%rcx
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je L$skip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
- movdqu 48(%rdx),%xmm6
+ movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%rdx),%xmm11
- movdqu 0(%rdx),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc L$tail4x
jmp L$mod4_loop
.p2align 5
L$mod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%rdx),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rsi),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa L$7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%rdx),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa L$7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc L$mod4_loop
L$tail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz L$done
- movdqu 32(%rsi),%xmm10
- subq $16,%rcx
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
jz L$odd_tail
L$skip4x:
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%rdx),%rdx
- subq $32,%rcx
+ nop
+ subq $0x20,%rcx
jbe L$even_tail
+ nop
jmp L$mod_loop
.p2align 5
L$mod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
-
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%rdx),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%rdx),%xmm6
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%rdx),%rdx
- subq $32,%rcx
+ subq $0x20,%rcx
ja L$mod_loop
L$even_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
jnz L$done
L$odd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
psrlq $1,%xmm0
pxor %xmm1,%xmm0
L$done:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.p2align 5
_gcm_init_avx:
- jmp L$_init_clmul
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp L$init_start_avx
+.p2align 5
+L$init_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+L$init_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz L$init_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
.globl _gcm_gmult_avx
.p2align 5
_gcm_ghash_avx:
- jmp L$_ghash_clmul
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq L$0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu L$bswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb L$short_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb L$tail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp L$oop8x_avx
+
+.p2align 5
+L$oop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc L$oop8x_avx
+
+ addq $0x80,%rcx
+ jmp L$tail_no_xor_avx
+
+.p2align 5
+L$short_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp L$tail_avx
+
+.p2align 5
+L$tail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+L$tail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne L$short_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
.p2align 6
L$bswap_mask:
# define bit_AES 0x2000000
#endif
+#ifndef bit_AVX
+# define bit_AVX 0x10000000
+#endif
+
+#ifndef bit_MOVBE
+# define bit_MOVBE 0x00400000
+#endif
+
#define via_bit_PADLOCK (0x3 << 6)
#define via_bit_PADLOCK_PHE (0x3 << 10)
#define via_bit_PADLOCK_PHE_SHA512 (0x3 << 25)
#define INTEL_AES_NI (1<<1)
#define INTEL_SSSE3 (1<<2)
#define INTEL_PCLMUL (1<<3)
+#define INTEL_AVX (1<<4)
#define VIA_PADLOCK (1<<20)
#define VIA_PADLOCK_PHE (1<<21)
#define VIA_PADLOCK_PHE_SHA512 (1<<22)
}
}
+ if (capabilities & INTEL_AVX) {
+ if ((b & bit_AVX) && (b & bit_MOVBE)) {
+ _gnutls_x86_cpuid_s[1] |= bit_AVX|bit_MOVBE;
+ } else {
+ _gnutls_debug_log
+ ("AVX acceleration requested but not available\n");
+ }
+ }
+
if (capabilities & INTEL_PCLMUL) {
if (b & bit_PCLMUL) {
_gnutls_x86_cpuid_s[1] |= bit_PCLMUL;
}
#ifdef ASM_X86_64
+static unsigned check_avx_movbe(void)
+{
+ return ((_gnutls_x86_cpuid_s[1] & bit_AVX) && (_gnutls_x86_cpuid_s[1] & bit_MOVBE));
+}
+
static unsigned check_pclmul(void)
{
return (_gnutls_x86_cpuid_s[1] & bit_PCLMUL);
#ifdef ASM_X86_64
if (check_pclmul()) {
/* register GCM ciphers */
- _gnutls_debug_log
- ("Intel GCM accelerator was detected\n");
- ret =
- gnutls_crypto_single_cipher_register
- (GNUTLS_CIPHER_AES_128_GCM, 80,
- &_gnutls_aes_gcm_pclmul, 0);
- if (ret < 0) {
- gnutls_assert();
- }
-
- ret =
- gnutls_crypto_single_cipher_register
- (GNUTLS_CIPHER_AES_256_GCM, 80,
- &_gnutls_aes_gcm_pclmul, 0);
- if (ret < 0) {
- gnutls_assert();
+ if (check_avx_movbe()) {
+ _gnutls_debug_log
+ ("Intel GCM accelerator (AVX) was detected\n");
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_128_GCM, 80,
+ &_gnutls_aes_gcm_pclmul_avx, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
+
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_256_GCM, 80,
+ &_gnutls_aes_gcm_pclmul_avx, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
+ } else {
+ _gnutls_debug_log
+ ("Intel GCM accelerator was detected\n");
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_128_GCM, 80,
+ &_gnutls_aes_gcm_pclmul, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
+
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_256_GCM, 80,
+ &_gnutls_aes_gcm_pclmul, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
}
} else
#endif
if (p) {
capabilities = strtol(p, NULL, 0);
}
-
+
register_x86_intel_crypto(capabilities);
#ifdef ENABLE_PADLOCK
register_x86_padlock_crypto(capabilities);
exit $ret
fi
-GNUTLS_CPUID_OVERRIDE=0x8 ${PROG}
+#AESNI+PCLMUL
+GNUTLS_CPUID_OVERRIDE=0xA ${PROG}
ret=$?
if test $ret != 0; then
echo "PCLMUL cipher tests failed"
exit $ret
fi
+#AESNI+PCLMUL+AVX
+GNUTLS_CPUID_OVERRIDE=0x1A ${PROG}
+ret=$?
+if test $ret != 0; then
+ echo "PCLMUL-AVX cipher tests failed"
+ exit $ret
+fi
+
GNUTLS_CPUID_OVERRIDE=0x100000 ${PROG}
ret=$?
if test $ret != 0; then