git.ipfire.org Git - thirdparty/gnutls.git/commitdiff
Updated asm files to latest version under cryptogams license
author Nikos Mavrogiannopoulos <nmav@redhat.com>
Fri, 26 Apr 2019 12:43:19 +0000 (14:43 +0200)
committer Nikos Mavrogiannopoulos <nmav@gnutls.org>
Thu, 27 Jun 2019 04:45:15 +0000 (06:45 +0200)
Signed-off-by: Nikos Mavrogiannopoulos <nmav@redhat.com>
57 files changed:
cfg.mk
devel/openssl
devel/perlasm/cpuid-x86.pl
devel/perlasm/sha256-ssse3-x86_64.pl [new symlink]
devel/perlasm/sha256-ssse3-x86_64.pl.license [new symlink]
lib/accelerated/aarch64/elf/aes-aarch64.s
lib/accelerated/aarch64/elf/ghash-aarch64.s
lib/accelerated/aarch64/elf/sha1-armv8.s
lib/accelerated/aarch64/elf/sha256-armv8.s
lib/accelerated/aarch64/elf/sha512-armv8.s
lib/accelerated/aarch64/macosx/aes-aarch64.s
lib/accelerated/aarch64/macosx/ghash-aarch64.s
lib/accelerated/aarch64/macosx/sha1-armv8.s
lib/accelerated/aarch64/macosx/sha256-armv8.s
lib/accelerated/aarch64/macosx/sha512-armv8.s
lib/accelerated/x86/coff/aes-ssse3-x86.s
lib/accelerated/x86/coff/aes-ssse3-x86_64.s
lib/accelerated/x86/coff/aesni-gcm-x86_64.s
lib/accelerated/x86/coff/aesni-x86.s
lib/accelerated/x86/coff/aesni-x86_64.s
lib/accelerated/x86/coff/cpuid-x86.s
lib/accelerated/x86/coff/ghash-x86_64.s
lib/accelerated/x86/coff/sha1-ssse3-x86.s
lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
lib/accelerated/x86/coff/sha256-ssse3-x86.s
lib/accelerated/x86/coff/sha256-ssse3-x86_64.s [new file with mode: 0644]
lib/accelerated/x86/coff/sha512-ssse3-x86.s
lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
lib/accelerated/x86/elf/aes-ssse3-x86.s
lib/accelerated/x86/elf/aes-ssse3-x86_64.s
lib/accelerated/x86/elf/aesni-gcm-x86_64.s
lib/accelerated/x86/elf/aesni-x86.s
lib/accelerated/x86/elf/aesni-x86_64.s
lib/accelerated/x86/elf/cpuid-x86.s
lib/accelerated/x86/elf/cpuid-x86_64.s
lib/accelerated/x86/elf/ghash-x86_64.s
lib/accelerated/x86/elf/sha1-ssse3-x86.s
lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
lib/accelerated/x86/elf/sha256-ssse3-x86.s
lib/accelerated/x86/elf/sha256-ssse3-x86_64.s [new file with mode: 0644]
lib/accelerated/x86/elf/sha512-ssse3-x86.s
lib/accelerated/x86/elf/sha512-ssse3-x86_64.s
lib/accelerated/x86/files.mk
lib/accelerated/x86/macosx/aes-ssse3-x86.s
lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
lib/accelerated/x86/macosx/aesni-gcm-x86_64.s
lib/accelerated/x86/macosx/aesni-x86.s
lib/accelerated/x86/macosx/aesni-x86_64.s
lib/accelerated/x86/macosx/cpuid-x86.s
lib/accelerated/x86/macosx/ghash-x86_64.s
lib/accelerated/x86/macosx/sha1-ssse3-x86.s
lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s
lib/accelerated/x86/macosx/sha256-ssse3-x86.s
lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s [new file with mode: 0644]
lib/accelerated/x86/macosx/sha512-ssse3-x86.s
lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
lib/accelerated/x86/sha-x86.h

diff --git a/cfg.mk b/cfg.mk
index 3837d9619e59d8195303c82d7009771e3668b8af..f7ae6408fd42d58499e41f110a68fc839b29d568 100644 (file)
--- a/cfg.mk
+++ b/cfg.mk
@@ -138,6 +138,7 @@ ASM_SOURCES_XXX := \
        lib/accelerated/x86/XXX/sha1-ssse3-x86.s \
        lib/accelerated/x86/XXX/sha1-ssse3-x86_64.s \
        lib/accelerated/x86/XXX/sha256-ssse3-x86.s \
+       lib/accelerated/x86/XXX/sha256-ssse3-x86_64.s \
        lib/accelerated/x86/XXX/sha512-ssse3-x86.s \
        lib/accelerated/x86/XXX/sha512-ssse3-x86_64.s \
        lib/accelerated/x86/XXX/aesni-gcm-x86_64.s \
@@ -158,7 +159,7 @@ X86_FILES=XXX/aesni-x86.s XXX/cpuid-x86.s XXX/sha1-ssse3-x86.s \
 
 X86_64_FILES=XXX/aesni-x86_64.s XXX/cpuid-x86_64.s XXX/ghash-x86_64.s \
        XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s \
-       XXX/aesni-gcm-x86_64.s
+       XXX/aesni-gcm-x86_64.s XXX/sha256-ssse3-x86_64.s
 
 X86_PADLOCK_FILES=XXX/e_padlock-x86.s
 X86_64_PADLOCK_FILES=XXX/e_padlock-x86_64.s
@@ -194,27 +195,27 @@ lib/accelerated/x86/files.mk: $(ASM_SOURCES_ELF)
 
 # Appro's code
 lib/accelerated/x86/elf/%.s: devel/perlasm/%.pl .submodule.stamp 
-       cat $<.license > $@
-       CC=gcc perl $< elf >> $@
+       CC=gcc perl $< elf $@.tmp
+       cat $<.license $@.tmp > $@ && rm -f $@.tmp
        echo "" >> $@
        echo ".section .note.GNU-stack,\"\",%progbits" >> $@
        sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
 
 lib/accelerated/x86/coff/%-x86.s: devel/perlasm/%-x86.pl .submodule.stamp 
-       cat $<.license > $@
-       CC=gcc perl $< coff >> $@
+       CC=gcc perl $< coff $@.tmp
+       cat $<.license $@.tmp > $@ && rm -f $@.tmp
        echo "" >> $@
        sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
 
 lib/accelerated/x86/coff/%-x86_64.s: devel/perlasm/%-x86_64.pl .submodule.stamp 
-       cat $<.license > $@
-       CC=gcc perl $< mingw64 >> $@
+       CC=gcc perl $< mingw64 $@.tmp
+       cat $<.license $@.tmp > $@ && rm -f $@.tmp
        echo "" >> $@
        sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
 
 lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl .submodule.stamp 
-       cat $<.license > $@
-       CC=gcc perl $< macosx >> $@
+       CC=gcc perl $< macosx $@.tmp
+       cat $<.license $@.tmp > $@ && rm -f $@.tmp
        echo "" >> $@
        sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
 
index 2805ee1e095a78f596dc7adf778441e2edb9f15c..7216e9a20aee620d85185a6ddb8caa30f11f2192 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 2805ee1e095a78f596dc7adf778441e2edb9f15c
+Subproject commit 7216e9a20aee620d85185a6ddb8caa30f11f2192
index fa9c14e577641c0b918b9c29c5176e9ef1621bc8..a5541d45862219d4635632b2f6ed2f39b1b71509 100644 (file)
@@ -10,6 +10,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../crypto/perlasm");
 require "x86asm.pl";
 
+$output=pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],$0);
 
 &function_begin_B("gnutls_cpuid");
@@ -55,3 +58,5 @@ require "x86asm.pl";
 
 &asciz("CPUID for x86");
 &asm_finish();
+
+close STDOUT;
diff --git a/devel/perlasm/sha256-ssse3-x86_64.pl b/devel/perlasm/sha256-ssse3-x86_64.pl
new file mode 120000 (symlink)
index 0000000..1261627
--- /dev/null
@@ -0,0 +1 @@
+sha512-ssse3-x86_64.pl
\ No newline at end of file
diff --git a/devel/perlasm/sha256-ssse3-x86_64.pl.license b/devel/perlasm/sha256-ssse3-x86_64.pl.license
new file mode 120000 (symlink)
index 0000000..614714a
--- /dev/null
@@ -0,0 +1 @@
+sha512-ssse3-x86_64.pl.license
\ No newline at end of file
index e5a2dc500dc96369b1924a1a287ada4c42bbbb0d..ab227a8c147c9fb97f197b3806ef45c39b9396f8 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" 2
@@ -226,6 +224,7 @@ aes_v8_set_encrypt_key:
 .type aes_v8_set_decrypt_key,%function
 .align 5
 aes_v8_set_decrypt_key:
+.inst 0xd503233f
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
  bl .Lenc_key
@@ -259,6 +258,7 @@ aes_v8_set_decrypt_key:
  eor x0,x0,x0
 .Ldec_key_abort:
  ldp x29,x30,[sp],#16
+.inst 0xd50323bf
  ret
 .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
 .globl aes_v8_encrypt
index 3875047c5ec34a7f45c2119491d07a6609f28757..c30139985bb8549ce43254a599feb456f09bf7be 100644 (file)
 # 1 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" 2
 
+
 .text
 .arch armv8-a+crypto
 .globl gcm_init_v8
@@ -193,7 +192,7 @@ gcm_ghash_v8:
 
  subs x3,x3,#32
  mov x12,#16
-# 158 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
+# 159 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
  ld1 {v20.2d,v21.2d},[x1],#32
  movi v19.16b,#0xe1
  ld1 {v22.2d},[x1]
index 5c588ff7a668d8fe74314fb4c1ca82a1508a199d..4b65cf6ea8280058ae503ec29dc6004367e76dd6 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S" 2
index 4439e3cc6952bc6eec90d85142192365636bebe0..bc3f146c6853eecc4f8a4fc5cbf3d2b83ee277d1 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
 # 56 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
@@ -69,6 +67,7 @@ sha256_block_data_order:
  tst w16,#(1<<0)
  b.ne .Lneon_entry
 
+.inst 0xd503233f
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1028,6 +1027,7 @@ sha256_block_data_order:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
+.inst 0xd50323bf
  ret
 .size sha256_block_data_order,.-sha256_block_data_order
 
index 0d65657dd990d85075706d0c3529dadeff8e275e..b036c2a121560ebad92311d3890d4a3fae30d640 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
 # 56 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
@@ -67,6 +65,7 @@ sha512_block_data_order:
  tst w16,#(1<<6)
  b.ne .Lv8_entry
 
+.inst 0xd503233f
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1026,6 +1025,7 @@ sha512_block_data_order:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
+.inst 0xd50323bf
  ret
 .size sha512_block_data_order,.-sha512_block_data_order
 
index f017bcd95d9dab8c9485d4234ec015cef4b2cd42..7acabf3f255b2925d237c2bac4f247f7892e8a35 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S" 2
@@ -226,6 +224,7 @@ Lenc_key_abort:
 
 .align 5
 _aes_v8_set_decrypt_key:
+.long 0xd503233f
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
  bl Lenc_key
@@ -259,6 +258,7 @@ Loop_imc:
  eor x0,x0,x0
 Ldec_key_abort:
  ldp x29,x30,[sp],#16
+.long 0xd50323bf
  ret
 
 .globl _aes_v8_encrypt
index f49a8dbb70c948bd9d84089a5112489d853f13ee..bf33773aa80cad11e445fecfefaf4914c610ef22 100644 (file)
 # 1 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" 2
 
+
 .text
 
 .globl _gcm_init_v8
@@ -193,7 +192,7 @@ _gcm_ghash_v8:
 
  subs x3,x3,#32
  mov x12,#16
-# 158 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
+# 159 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
  ld1 {v20.2d,v21.2d},[x1],#32
  movi v19.16b,#0xe1
  ld1 {v22.2d},[x1]
index 221dc11731caa41d68d667d0bf0b0e9709be03f7..8e1e12edf600cc21d1c8250a0659e4087521dd6c 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S" 2
index b48f6ca42a914ac0ef7a1e9486ba642747ed4610..fc6424975ccbcf9b39eabc64dc876a62257c6a34 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
 # 56 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
@@ -69,6 +67,7 @@ _sha256_block_data_order:
  tst w16,#(1<<0)
  b.ne Lneon_entry
 
+.long 0xd503233f
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1028,6 +1027,7 @@ Loop_16_xx:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
+.long 0xd50323bf
  ret
 
 
index 798619bc9a15b7a870f5a45b6440f6b54446521b..43af71fa48c3279700fe8dfa3e498e5e03726855 100644 (file)
@@ -40,8 +40,6 @@
 # 1 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
 # 1 "<built-in>"
 # 1 "<command-line>"
-# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3
-# 1 "<command-line>" 2
 # 1 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
 # 56 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
@@ -67,6 +65,7 @@ _sha512_block_data_order:
  tst w16,#(1<<6)
  b.ne Lv8_entry
 
+.long 0xd503233f
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1026,6 +1025,7 @@ Loop_16_xx:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
+.long 0xd50323bf
  ret
 
 
index 6e6ea909733de85e8edae6b14c3607c79fb1ddbd..c58ea2359755bd5f0dd87abc5b3ec918798186d3 100644 (file)
@@ -5,12 +5,11 @@
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
-## For details see https://shiftleft.org/papers/vector_aes/ and
-## https://crypto.stanford.edu/vpaes/.
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
 #
 # *** This file is auto-generated ***
 #
-.file  "vpaes-x86.s"
 .text
 .align 64
 .L_vpaes_consts:
index 8c4a7d709d4dda97335fe3c6401fba0be55d11ed..150c9921d7c9ae0801a92b513f1b03374c01e5cc 100644 (file)
@@ -5,8 +5,8 @@
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
-## For details see https://shiftleft.org/papers/vector_aes/ and
-## https://crypto.stanford.edu/vpaes/.
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
 #
 # *** This file is auto-generated ***
 #
@@ -30,6 +30,7 @@
 .def   _vpaes_encrypt_core;    .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_encrypt_core:
+
        movq    %rdx,%r9
        movq    $16,%r11
        movl    240(%rdx),%eax
@@ -117,9 +118,11 @@ _vpaes_encrypt_core:
 
 
 
+
 .def   _vpaes_decrypt_core;    .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_decrypt_core:
+
        movq    %rdx,%r9
        movl    240(%rdx),%eax
        movdqa  %xmm9,%xmm1
@@ -223,6 +226,7 @@ _vpaes_decrypt_core:
 
 
 
+
 .def   _vpaes_schedule_core;   .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_schedule_core:
@@ -231,6 +235,7 @@ _vpaes_schedule_core:
 
 
 
+
        call    _vpaes_preheat
        movdqa  .Lk_rcon(%rip),%xmm8
        movdqu  (%rdi),%xmm0
@@ -408,9 +413,11 @@ _vpaes_schedule_core:
 
 
 
+
 .def   _vpaes_schedule_192_smear;      .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_schedule_192_smear:
+
        pshufd  $0x80,%xmm6,%xmm1
        pshufd  $0xFE,%xmm7,%xmm0
        pxor    %xmm1,%xmm6
@@ -438,11 +445,13 @@ _vpaes_schedule_192_smear:
 
 
 
+
 
 .def   _vpaes_schedule_round;  .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_schedule_round:
 
+
        pxor    %xmm1,%xmm1
 .byte  102,65,15,58,15,200,15
 .byte  102,69,15,58,15,192,15
@@ -506,9 +515,11 @@ _vpaes_schedule_low_round:
 
 
 
+
 .def   _vpaes_schedule_transform;      .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_schedule_transform:
+
        movdqa  %xmm9,%xmm1
        pandn   %xmm0,%xmm1
        psrld   $4,%xmm1
@@ -542,11 +553,13 @@ _vpaes_schedule_transform:
 
 
 
+
 
 
 .def   _vpaes_schedule_mangle; .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_schedule_mangle:
+
        movdqa  %xmm0,%xmm4
        movdqa  .Lk_mc_forward(%rip),%xmm5
        testq   %rcx,%rcx
@@ -616,6 +629,7 @@ _vpaes_schedule_mangle:
 
 
 
+
 .globl vpaes_set_encrypt_key
 .def   vpaes_set_encrypt_key;  .scl 2; .type 32;       .endef
 .p2align       4
@@ -628,6 +642,7 @@ vpaes_set_encrypt_key:
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
+
        leaq    -184(%rsp),%rsp
        movaps  %xmm6,16(%rsp)
        movaps  %xmm7,32(%rsp)
@@ -664,6 +679,7 @@ vpaes_set_encrypt_key:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_vpaes_set_encrypt_key:
 
 .globl vpaes_set_decrypt_key
@@ -678,6 +694,7 @@ vpaes_set_decrypt_key:
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
+
        leaq    -184(%rsp),%rsp
        movaps  %xmm6,16(%rsp)
        movaps  %xmm7,32(%rsp)
@@ -719,6 +736,7 @@ vpaes_set_decrypt_key:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_vpaes_set_decrypt_key:
 
 .globl vpaes_encrypt
@@ -733,6 +751,7 @@ vpaes_encrypt:
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
+
        leaq    -184(%rsp),%rsp
        movaps  %xmm6,16(%rsp)
        movaps  %xmm7,32(%rsp)
@@ -764,6 +783,7 @@ vpaes_encrypt:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_vpaes_encrypt:
 
 .globl vpaes_decrypt
@@ -778,6 +798,7 @@ vpaes_decrypt:
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
+
        leaq    -184(%rsp),%rsp
        movaps  %xmm6,16(%rsp)
        movaps  %xmm7,32(%rsp)
@@ -809,6 +830,7 @@ vpaes_decrypt:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_vpaes_decrypt:
 .globl vpaes_cbc_encrypt
 .def   vpaes_cbc_encrypt;      .scl 2; .type 32;       .endef
@@ -825,6 +847,7 @@ vpaes_cbc_encrypt:
        movq    40(%rsp),%r8
        movq    48(%rsp),%r9
 
+
        xchgq   %rcx,%rdx
        subq    $16,%rcx
        jc      .Lcbc_abort
@@ -886,6 +909,7 @@ vpaes_cbc_encrypt:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_vpaes_cbc_encrypt:
 
 
@@ -896,6 +920,7 @@ vpaes_cbc_encrypt:
 .def   _vpaes_preheat; .scl 3; .type 32;       .endef
 .p2align       4
 _vpaes_preheat:
+
        leaq    .Lk_s0F(%rip),%r10
        movdqa  -32(%r10),%xmm10
        movdqa  -16(%r10),%xmm11
@@ -912,6 +937,7 @@ _vpaes_preheat:
 
 
 
+
 .p2align       6
 _vpaes_consts:
 .Lk_inv:
index bc3554ca0770fe7b4bbaaaaef328bbce75b33955..7988004cb04b90e3f14005b7e3c7939c3c768759 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -365,17 +365,25 @@ aesni_gcm_decrypt:
        movq    40(%rsp),%r8
        movq    48(%rsp),%r9
 
+
        xorq    %r10,%r10
        cmpq    $0x60,%rdx
        jb      .Lgcm_dec_abort
 
        leaq    (%rsp),%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
+
        leaq    -168(%rsp),%rsp
        movaps  %xmm6,-216(%rax)
        movaps  %xmm7,-200(%rax)
@@ -459,17 +467,25 @@ aesni_gcm_decrypt:
        movaps  -88(%rax),%xmm14
        movaps  -72(%rax),%xmm15
        movq    -48(%rax),%r15
+
        movq    -40(%rax),%r14
+
        movq    -32(%rax),%r13
+
        movq    -24(%rax),%r12
+
        movq    -16(%rax),%rbp
+
        movq    -8(%rax),%rbx
+
        leaq    (%rax),%rsp
+
 .Lgcm_dec_abort:
        movq    %r10,%rax
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_gcm_decrypt:
 .def   _aesni_ctr32_6x;        .scl 3; .type 32;       .endef
 .p2align       5
@@ -577,17 +593,25 @@ aesni_gcm_encrypt:
        movq    40(%rsp),%r8
        movq    48(%rsp),%r9
 
+
        xorq    %r10,%r10
        cmpq    $288,%rdx
        jb      .Lgcm_enc_abort
 
        leaq    (%rsp),%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
+
        leaq    -168(%rsp),%rsp
        movaps  %xmm6,-216(%rax)
        movaps  %xmm7,-200(%rax)
@@ -835,17 +859,25 @@ aesni_gcm_encrypt:
        movaps  -88(%rax),%xmm14
        movaps  -72(%rax),%xmm15
        movq    -48(%rax),%r15
+
        movq    -40(%rax),%r14
+
        movq    -32(%rax),%r13
+
        movq    -24(%rax),%r12
+
        movq    -16(%rax),%rbp
+
        movq    -8(%rax),%rbx
+
        leaq    (%rax),%rsp
+
 .Lgcm_enc_abort:
        movq    %r10,%rax
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_gcm_encrypt:
 .p2align       6
 .Lbswap_mask:
index 502be77883cb8efb9c94a7f83b32109eba479c07..c6aa1a1e2a21217ecb8520ad838764c73ba8163c 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "devel/perlasm/aesni-x86.s"
 .text
 .globl _aesni_encrypt
 .def   _aesni_encrypt; .scl    2;      .type   32;     .endef
@@ -60,7 +59,10 @@ _aesni_encrypt:
        leal    16(%edx),%edx
        jnz     .L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .globl _aesni_decrypt
 .def   _aesni_decrypt; .scl    2;      .type   32;     .endef
@@ -83,31 +85,87 @@ _aesni_decrypt:
        leal    16(%edx),%edx
        jnz     .L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
+       ret
+.def   __aesni_encrypt2;       .scl    3;      .type   32;     .endef
+.align 16
+__aesni_encrypt2:
+       movups  (%edx),%xmm0
+       shll    $4,%ecx
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L002enc2_loop:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L002enc2_loop
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,221,208
+.byte  102,15,56,221,216
+       ret
+.def   __aesni_decrypt2;       .scl    3;      .type   32;     .endef
+.align 16
+__aesni_decrypt2:
+       movups  (%edx),%xmm0
+       shll    $4,%ecx
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L003dec2_loop:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L003dec2_loop
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,223,208
+.byte  102,15,56,223,216
        ret
 .def   __aesni_encrypt3;       .scl    3;      .type   32;     .endef
 .align 16
 __aesni_encrypt3:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
-       movups  (%edx),%xmm0
-.L002enc3_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L004enc3_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
-       movups  (%edx),%xmm0
-       jnz     .L002enc3_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L004enc3_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -119,25 +177,26 @@ __aesni_encrypt3:
 .align 16
 __aesni_decrypt3:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
-       movups  (%edx),%xmm0
-.L003dec3_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L005dec3_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
-       movups  (%edx),%xmm0
-       jnz     .L003dec3_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L005dec3_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -150,27 +209,29 @@ __aesni_decrypt3:
 __aesni_encrypt4:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       shrl    $1,%ecx
-       leal    32(%edx),%edx
+       shll    $4,%ecx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
        pxor    %xmm0,%xmm5
-       movups  (%edx),%xmm0
-.L004enc4_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  15,31,64,0
+       addl    $16,%ecx
+.L006enc4_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
-       movups  (%edx),%xmm0
-       jnz     .L004enc4_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L006enc4_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -185,27 +246,29 @@ __aesni_encrypt4:
 __aesni_decrypt4:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       shrl    $1,%ecx
-       leal    32(%edx),%edx
+       shll    $4,%ecx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
        pxor    %xmm0,%xmm5
-       movups  (%edx),%xmm0
-.L005dec4_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  15,31,64,0
+       addl    $16,%ecx
+.L007dec4_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
-       movups  (%edx),%xmm0
-       jnz     .L005dec4_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L007dec4_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -219,45 +282,42 @@ __aesni_decrypt4:
 .align 16
 __aesni_encrypt6:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm5
-       decl    %ecx
-.byte  102,15,56,220,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,217
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,241
-       movups  (%edx),%xmm0
-.byte  102,15,56,220,249
-       jmp     .L_aesni_encrypt6_enter
+       movups  (%edx,%ecx,1),%xmm0
+       addl    $16,%ecx
+       jmp     .L008_aesni_encrypt6_inner
 .align 16
-.L006enc6_loop:
+.L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
+.L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.align 16
 .L_aesni_encrypt6_enter:
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  (%edx),%xmm0
-       jnz     .L006enc6_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -275,45 +335,42 @@ __aesni_encrypt6:
 .align 16
 __aesni_decrypt6:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,222,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,222,217
+.byte  102,15,56,222,209
        pxor    %xmm0,%xmm5
-       decl    %ecx
-.byte  102,15,56,222,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,217
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,241
-       movups  (%edx),%xmm0
-.byte  102,15,56,222,249
-       jmp     .L_aesni_decrypt6_enter
+       movups  (%edx,%ecx,1),%xmm0
+       addl    $16,%ecx
+       jmp     .L010_aesni_decrypt6_inner
 .align 16
-.L007dec6_loop:
+.L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
+.L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.align 16
 .L_aesni_decrypt6_enter:
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  (%edx),%xmm0
-       jnz     .L007dec6_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -342,14 +399,14 @@ _aesni_ecb_encrypt:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      .L008ecb_ret
+       jz      .L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      .L009ecb_decrypt
+       jz      .L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L010ecb_enc_tail
+       jb      .L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -358,9 +415,9 @@ _aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L011ecb_enc_loop6_enter
+       jmp     .L015ecb_enc_loop6_enter
 .align 16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -375,12 +432,12 @@ _aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
        call    __aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L012ecb_enc_loop6
+       jnc     .L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -389,18 +446,18 @@ _aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L008ecb_ret
-.L010ecb_enc_tail:
+       jz      .L012ecb_ret
+.L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L013ecb_enc_one
+       jb      .L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      .L014ecb_enc_two
+       je      .L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L015ecb_enc_three
+       jb      .L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      .L016ecb_enc_four
+       je      .L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_encrypt6
@@ -409,50 +466,49 @@ _aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L017enc1_loop_3
+       jnz     .L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L014ecb_enc_two:
-       xorps   %xmm4,%xmm4
-       call    __aesni_encrypt3
+.L018ecb_enc_two:
+       call    __aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
        call    __aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
        call    __aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L018ecb_dec_tail
+       jb      .L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -461,9 +517,9 @@ _aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L019ecb_dec_loop6_enter
+       jmp     .L023ecb_dec_loop6_enter
 .align 16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -478,12 +534,12 @@ _aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
        call    __aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L020ecb_dec_loop6
+       jnc     .L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -492,18 +548,18 @@ _aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L008ecb_ret
-.L018ecb_dec_tail:
+       jz      .L012ecb_ret
+.L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L021ecb_dec_one
+       jb      .L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      .L022ecb_dec_two
+       je      .L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L023ecb_dec_three
+       jb      .L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      .L024ecb_dec_four
+       je      .L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_decrypt6
@@ -512,44 +568,51 @@ _aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L025dec1_loop_4
+       jnz     .L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L022ecb_dec_two:
-       xorps   %xmm4,%xmm4
-       call    __aesni_decrypt3
+.L026ecb_dec_two:
+       call    __aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
        call    __aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
        call    __aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -587,48 +650,56 @@ _aesni_ccm64_encrypt_blocks:
        movl    %ebp,20(%esp)
        movl    %ebp,24(%esp)
        movl    %ebp,28(%esp)
-       shrl    $1,%ecx
+       shll    $4,%ecx
+       movl    $16,%ebx
        leal    (%edx),%ebp
        movdqa  (%esp),%xmm5
        movdqa  %xmm7,%xmm2
-       movl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       subl    %ecx,%ebx
 .byte  102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
        xorps   %xmm0,%xmm2
        movups  16(%ebp),%xmm1
        xorps   %xmm6,%xmm0
-       leal    32(%ebp),%edx
        xorps   %xmm0,%xmm3
-       movups  (%edx),%xmm0
-.L027ccm64_enc2_loop:
+       movups  32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
-       decl    %ecx
 .byte  102,15,56,220,217
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
-       leal    32(%edx),%edx
 .byte  102,15,56,220,216
-       movups  (%edx),%xmm0
-       jnz     .L027ccm64_enc2_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
+       decl    %eax
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
-       decl    %eax
        leal    16(%esi),%esi
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
-       leal    16(%edi),%edi
 .byte  102,15,56,0,213
-       jnz     .L026ccm64_enc_outer
+       leal    16(%edi),%edi
+       jnz     .L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -675,71 +746,82 @@ _aesni_ccm64_decrypt_blocks:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L028enc1_loop_5
+       jnz     .L032enc1_loop_5
 .byte  102,15,56,221,209
+       shll    $4,%ebx
+       movl    $16,%ecx
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
        leal    16(%esi),%esi
-       jmp     .L029ccm64_dec_outer
+       subl    %ebx,%ecx
+       leal    32(%ebp,%ebx,1),%edx
+       movl    %ecx,%ebx
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
-       movl    %ebx,%ecx
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      .L030ccm64_dec_break
+       jz      .L034ccm64_dec_break
        movups  (%ebp),%xmm0
-       shrl    $1,%ecx
+       movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
        xorps   %xmm0,%xmm6
-       leal    32(%ebp),%edx
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
-       movups  (%edx),%xmm0
-.L031ccm64_dec2_loop:
+       movups  32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
-       decl    %ecx
 .byte  102,15,56,220,217
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
-       leal    32(%edx),%edx
 .byte  102,15,56,220,216
-       movups  (%edx),%xmm0
-       jnz     .L031ccm64_dec2_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       leal    16(%esi),%esi
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
-       jmp     .L029ccm64_dec_outer
+       leal    16(%esi),%esi
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L032enc1_loop_6
+       jnz     .L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -764,7 +846,7 @@ _aesni_ctr32_encrypt_blocks:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      .L033ctr32_one_shortcut
+       je      .L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -780,63 +862,59 @@ _aesni_ctr32_encrypt_blocks:
 .byte  102,15,58,34,253,3
        movl    240(%edx),%ecx
        bswap   %ebx
-       pxor    %xmm1,%xmm1
        pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movdqa  (%esp),%xmm2
-.byte  102,15,58,34,203,0
+.byte  102,15,58,34,195,0
        leal    3(%ebx),%ebp
-.byte  102,15,58,34,197,0
+.byte  102,15,58,34,205,0
        incl    %ebx
-.byte  102,15,58,34,203,1
+.byte  102,15,58,34,195,1
        incl    %ebp
-.byte  102,15,58,34,197,1
+.byte  102,15,58,34,205,1
        incl    %ebx
-.byte  102,15,58,34,203,2
+.byte  102,15,58,34,195,2
        incl    %ebp
-.byte  102,15,58,34,197,2
-       movdqa  %xmm1,48(%esp)
-.byte  102,15,56,0,202
-       movdqa  %xmm0,64(%esp)
+.byte  102,15,58,34,205,2
+       movdqa  %xmm0,48(%esp)
 .byte  102,15,56,0,194
-       pshufd  $192,%xmm1,%xmm2
-       pshufd  $128,%xmm1,%xmm3
+       movdqu  (%edx),%xmm6
+       movdqa  %xmm1,64(%esp)
+.byte  102,15,56,0,202
+       pshufd  $192,%xmm0,%xmm2
+       pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      .L034ctr32_tail
+       jb      .L038ctr32_tail
+       pxor    %xmm6,%xmm7
+       shll    $4,%ecx
+       movl    $16,%ebx
        movdqa  %xmm7,32(%esp)
-       shrl    $1,%ecx
        movl    %edx,%ebp
-       movl    %ecx,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
-       pshufd  $64,%xmm1,%xmm4
-       movdqa  32(%esp),%xmm1
-       pshufd  $192,%xmm0,%xmm5
-       por     %xmm1,%xmm2
-       pshufd  $128,%xmm0,%xmm6
-       por     %xmm1,%xmm3
-       pshufd  $64,%xmm0,%xmm7
-       por     %xmm1,%xmm4
-       por     %xmm1,%xmm5
-       por     %xmm1,%xmm6
-       por     %xmm1,%xmm7
-       movups  (%ebp),%xmm0
-       movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
-       decl    %ecx
+       jmp     .L039ctr32_loop6
+.align 16
+.L039ctr32_loop6:
+       pshufd  $64,%xmm0,%xmm4
+       movdqa  32(%esp),%xmm0
+       pshufd  $192,%xmm1,%xmm5
        pxor    %xmm0,%xmm2
+       pshufd  $128,%xmm1,%xmm6
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
+       pshufd  $64,%xmm1,%xmm7
+       movups  16(%ebp),%xmm1
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,220,225
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
        pxor    %xmm0,%xmm7
+.byte  102,15,56,220,217
+       movups  32(%ebp),%xmm0
+       movl    %ebx,%ecx
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
 .byte  102,15,56,220,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,220,249
        call    .L_aesni_encrypt6_enter
        movups  (%esi),%xmm1
@@ -847,51 +925,51 @@ _aesni_ctr32_encrypt_blocks:
        movups  %xmm2,(%edi)
        movdqa  16(%esp),%xmm0
        xorps   %xmm1,%xmm4
-       movdqa  48(%esp),%xmm1
+       movdqa  64(%esp),%xmm1
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        paddd   %xmm0,%xmm1
-       paddd   64(%esp),%xmm0
+       paddd   48(%esp),%xmm0
        movdqa  (%esp),%xmm2
        movups  48(%esi),%xmm3
        movups  64(%esi),%xmm4
        xorps   %xmm3,%xmm5
        movups  80(%esi),%xmm3
        leal    96(%esi),%esi
-       movdqa  %xmm1,48(%esp)
-.byte  102,15,56,0,202
+       movdqa  %xmm0,48(%esp)
+.byte  102,15,56,0,194
        xorps   %xmm4,%xmm6
        movups  %xmm5,48(%edi)
        xorps   %xmm3,%xmm7
-       movdqa  %xmm0,64(%esp)
-.byte  102,15,56,0,194
+       movdqa  %xmm1,64(%esp)
+.byte  102,15,56,0,202
        movups  %xmm6,64(%edi)
-       pshufd  $192,%xmm1,%xmm2
+       pshufd  $192,%xmm0,%xmm2
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
-       movl    %ebx,%ecx
-       pshufd  $128,%xmm1,%xmm3
+       pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     .L035ctr32_loop6
+       jnc     .L039ctr32_loop6
        addl    $6,%eax
-       jz      .L036ctr32_ret
+       jz      .L040ctr32_ret
+       movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
-       leal    1(,%ecx,2),%ecx
-       movdqa  32(%esp),%xmm7
-.L034ctr32_tail:
+       pxor    32(%esp),%xmm7
+       movl    240(%ebp),%ecx
+.L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      .L037ctr32_one
-       pshufd  $64,%xmm1,%xmm4
+       jb      .L041ctr32_one
+       pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      .L038ctr32_two
-       pshufd  $192,%xmm0,%xmm5
+       je      .L042ctr32_two
+       pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      .L039ctr32_three
-       pshufd  $128,%xmm0,%xmm6
+       jb      .L043ctr32_three
+       pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      .L040ctr32_four
+       je      .L044ctr32_four
        por     %xmm7,%xmm6
        call    __aesni_encrypt6
        movups  (%esi),%xmm1
@@ -909,39 +987,39 @@ _aesni_ctr32_encrypt_blocks:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L041enc1_loop_7
+       jnz     .L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L038ctr32_two:
-       call    __aesni_encrypt3
+.L042ctr32_two:
+       call    __aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L039ctr32_three:
+.L043ctr32_three:
        call    __aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -952,9 +1030,9 @@ _aesni_ctr32_encrypt_blocks:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L040ctr32_four:
+.L044ctr32_four:
        call    __aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -968,7 +1046,18 @@ _aesni_ctr32_encrypt_blocks:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -992,12 +1081,12 @@ _aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L042enc1_loop_8
+       jnz     .L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1021,12 +1110,14 @@ _aesni_xts_encrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      .L043xts_enc_short
-       shrl    $1,%ecx
-       movl    %ecx,%ebx
-       jmp     .L044xts_enc_loop6
+       jc      .L047xts_enc_short
+       shll    $4,%ecx
+       movl    $16,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       jmp     .L048xts_enc_loop6
 .align 16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1062,6 +1153,7 @@ _aesni_xts_encrypt:
        pand    %xmm3,%xmm7
        movups  (%esi),%xmm2
        pxor    %xmm1,%xmm7
+       movl    %ebx,%ecx
        movdqu  16(%esi),%xmm3
        xorps   %xmm0,%xmm2
        movdqu  32(%esi),%xmm4
@@ -1077,19 +1169,17 @@ _aesni_xts_encrypt:
        movdqa  %xmm7,80(%esp)
        pxor    %xmm1,%xmm7
        movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
        pxor    16(%esp),%xmm3
-.byte  102,15,56,220,209
        pxor    32(%esp),%xmm4
-.byte  102,15,56,220,217
+.byte  102,15,56,220,209
        pxor    48(%esp),%xmm5
-       decl    %ecx
-.byte  102,15,56,220,225
        pxor    64(%esp),%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm7
+       movups  32(%ebp),%xmm0
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
 .byte  102,15,56,220,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,220,249
        call    .L_aesni_encrypt6_enter
        movdqa  80(%esp),%xmm1
@@ -1114,26 +1204,25 @@ _aesni_xts_encrypt:
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
-       movl    %ebx,%ecx
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L044xts_enc_loop6
-       leal    1(,%ecx,2),%ecx
+       jnc     .L048xts_enc_loop6
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
        addl    $96,%eax
-       jz      .L045xts_enc_done6x
+       jz      .L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L046xts_enc_one
+       jb      .L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L047xts_enc_two
+       je      .L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1142,7 +1231,7 @@ _aesni_xts_encrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L048xts_enc_three
+       jb      .L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1152,7 +1241,7 @@ _aesni_xts_encrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L049xts_enc_four
+       je      .L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1184,9 +1273,9 @@ _aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L046xts_enc_one:
+.L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1194,37 +1283,36 @@ _aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L051enc1_loop_9
+       jnz     .L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L047xts_enc_two:
+.L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
        leal    32(%esi),%esi
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
-       xorps   %xmm4,%xmm4
-       call    __aesni_encrypt3
+       call    __aesni_encrypt2
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L048xts_enc_three:
+.L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1242,9 +1330,9 @@ _aesni_xts_encrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L049xts_enc_four:
+.L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1266,28 +1354,28 @@ _aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L052xts_enc_ret
+       jz      .L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     .L053xts_enc_steal
+       jmp     .L057xts_enc_steal
 .align 16
-.L050xts_enc_done:
+.L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L052xts_enc_ret
+       jz      .L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1295,7 +1383,7 @@ _aesni_xts_encrypt:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L053xts_enc_steal
+       jnz     .L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1305,16 +1393,30 @@ _aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L054enc1_loop_10
+       jnz     .L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1338,12 +1440,12 @@ _aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L055enc1_loop_11
+       jnz     .L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1372,12 +1474,14 @@ _aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      .L056xts_dec_short
-       shrl    $1,%ecx
-       movl    %ecx,%ebx
-       jmp     .L057xts_dec_loop6
+       jc      .L060xts_dec_short
+       shll    $4,%ecx
+       movl    $16,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       jmp     .L061xts_dec_loop6
 .align 16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1413,6 +1517,7 @@ _aesni_xts_decrypt:
        pand    %xmm3,%xmm7
        movups  (%esi),%xmm2
        pxor    %xmm1,%xmm7
+       movl    %ebx,%ecx
        movdqu  16(%esi),%xmm3
        xorps   %xmm0,%xmm2
        movdqu  32(%esi),%xmm4
@@ -1428,19 +1533,17 @@ _aesni_xts_decrypt:
        movdqa  %xmm7,80(%esp)
        pxor    %xmm1,%xmm7
        movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
        pxor    16(%esp),%xmm3
-.byte  102,15,56,222,209
        pxor    32(%esp),%xmm4
-.byte  102,15,56,222,217
+.byte  102,15,56,222,209
        pxor    48(%esp),%xmm5
-       decl    %ecx
-.byte  102,15,56,222,225
        pxor    64(%esp),%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm7
+       movups  32(%ebp),%xmm0
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
 .byte  102,15,56,222,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,222,249
        call    .L_aesni_decrypt6_enter
        movdqa  80(%esp),%xmm1
@@ -1465,26 +1568,25 @@ _aesni_xts_decrypt:
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
-       movl    %ebx,%ecx
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L057xts_dec_loop6
-       leal    1(,%ecx,2),%ecx
+       jnc     .L061xts_dec_loop6
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
        addl    $96,%eax
-       jz      .L058xts_dec_done6x
+       jz      .L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L059xts_dec_one
+       jb      .L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L060xts_dec_two
+       je      .L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1493,7 +1595,7 @@ _aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L061xts_dec_three
+       jb      .L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1503,7 +1605,7 @@ _aesni_xts_decrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L062xts_dec_four
+       je      .L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1535,9 +1637,9 @@ _aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L059xts_dec_one:
+.L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1545,36 +1647,36 @@ _aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L064dec1_loop_12
+       jnz     .L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L060xts_dec_two:
+.L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
        leal    32(%esi),%esi
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
-       call    __aesni_decrypt3
+       call    __aesni_decrypt2
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L061xts_dec_three:
+.L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1592,9 +1694,9 @@ _aesni_xts_decrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L062xts_dec_four:
+.L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1616,20 +1718,20 @@ _aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L065xts_dec_ret
+       jz      .L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     .L066xts_dec_only_one_more
+       jmp     .L070xts_dec_only_one_more
 .align 16
-.L063xts_dec_done:
+.L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L065xts_dec_ret
+       jz      .L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1639,7 +1741,7 @@ _aesni_xts_decrypt:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1653,16 +1755,16 @@ _aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L067dec1_loop_13
+       jnz     .L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1670,7 +1772,7 @@ _aesni_xts_decrypt:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L068xts_dec_steal
+       jnz     .L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1680,105 +1782,908 @@ _aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L069dec1_loop_14
+       jnz     .L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
        popl    %ebx
        popl    %ebp
        ret
-.globl _aesni_cbc_encrypt
-.def   _aesni_cbc_encrypt;     .scl    2;      .type   32;     .endef
+.globl _aesni_ocb_encrypt
+.def   _aesni_ocb_encrypt;     .scl    2;      .type   32;     .endef
 .align 16
-_aesni_cbc_encrypt:
-.L_aesni_cbc_encrypt_begin:
+_aesni_ocb_encrypt:
+.L_aesni_ocb_encrypt_begin:
        pushl   %ebp
        pushl   %ebx
        pushl   %esi
        pushl   %edi
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
        movl    20(%esp),%esi
-       movl    %esp,%ebx
        movl    24(%esp),%edi
-       subl    $24,%ebx
        movl    28(%esp),%eax
-       andl    $-16,%ebx
        movl    32(%esp),%edx
+       movdqu  (%ecx),%xmm0
        movl    36(%esp),%ebp
-       testl   %eax,%eax
-       jz      .L070cbc_abort
-       cmpl    $0,40(%esp)
-       xchgl   %esp,%ebx
-       movups  (%ebp),%xmm7
+       movdqu  (%ebx),%xmm1
+       movl    44(%esp),%ebx
+       movl    %esp,%ecx
+       subl    $132,%esp
+       andl    $-16,%esp
+       subl    %esi,%edi
+       shll    $4,%eax
+       leal    -96(%esi,%eax,1),%eax
+       movl    %edi,120(%esp)
+       movl    %eax,124(%esp)
+       movl    %ecx,128(%esp)
        movl    240(%edx),%ecx
-       movl    %edx,%ebp
-       movl    %ebx,16(%esp)
-       movl    %ecx,%ebx
-       je      .L071cbc_decrypt
-       movaps  %xmm7,%xmm2
-       cmpl    $16,%eax
-       jb      .L072cbc_enc_tail
-       subl    $16,%eax
-       jmp     .L073cbc_enc_loop
-.align 16
-.L073cbc_enc_loop:
-       movups  (%esi),%xmm7
+       testl   $1,%ebp
+       jnz     .L074odd
+       bsfl    %ebp,%eax
+       addl    $1,%ebp
+       shll    $4,%eax
+       movdqu  (%ebx,%eax,1),%xmm7
+       movl    %edx,%eax
+       movdqu  (%esi),%xmm2
        leal    16(%esi),%esi
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
-       xorps   %xmm7,%xmm2
-.L074enc1_loop_15:
+       xorps   %xmm0,%xmm2
+.L075enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L074enc1_loop_15
+       jnz     .L075enc1_loop_15
 .byte  102,15,56,221,209
-       movl    %ebx,%ecx
-       movl    %ebp,%edx
-       movups  %xmm2,(%edi)
-       leal    16(%edi),%edi
-       subl    $16,%eax
-       jnc     .L073cbc_enc_loop
-       addl    $16,%eax
-       jnz     .L072cbc_enc_tail
-       movaps  %xmm2,%xmm7
-       jmp     .L075cbc_ret
-.L072cbc_enc_tail:
-       movl    %eax,%ecx
-.long  2767451785
-       movl    $16,%ecx
-       subl    %eax,%ecx
-       xorl    %eax,%eax
-.long  2868115081
-       leal    -16(%edi),%edi
-       movl    %ebx,%ecx
-       movl    %edi,%esi
-       movl    %ebp,%edx
-       jmp     .L073cbc_enc_loop
-.align 16
-.L071cbc_decrypt:
-       cmpl    $80,%eax
-       jbe     .L076cbc_dec_tail
-       movaps  %xmm7,(%esp)
-       subl    $80,%eax
-       jmp     .L077cbc_dec_loop6_enter
-.align 16
-.L078cbc_dec_loop6:
-       movaps  %xmm0,(%esp)
-       movups  %xmm7,(%edi)
+       xorps   %xmm7,%xmm2
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm6,%xmm1
+       movups  %xmm2,-16(%edi,%esi,1)
+       movl    240(%eax),%ecx
+       movl    %eax,%edx
+       movl    124(%esp),%eax
+.L074odd:
+       shll    $4,%ecx
+       movl    $16,%edi
+       subl    %ecx,%edi
+       movl    %edx,112(%esp)
+       leal    32(%edx,%ecx,1),%edx
+       movl    %edi,116(%esp)
+       cmpl    %eax,%esi
+       ja      .L076short
+       jmp     .L077grandloop
+.align 32
+.L077grandloop:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       leal    5(%ebp),%edi
+       addl    $6,%ebp
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       bsfl    %edi,%edi
+       shll    $4,%ecx
+       shll    $4,%eax
+       shll    $4,%edi
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       movdqu  (%ebx,%edi,1),%xmm7
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movdqa  %xmm7,80(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       movdqu  80(%esi),%xmm7
+       leal    96(%esi),%esi
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm6,%xmm1
+       pxor    %xmm0,%xmm6
+       pxor    %xmm7,%xmm1
+       pxor    %xmm0,%xmm7
+       movdqa  %xmm1,96(%esp)
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    80(%esp),%xmm7
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movl    120(%esp),%edi
+       movl    124(%esp),%eax
+       call    .L_aesni_encrypt6_enter
+       movdqa  80(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    %xmm0,%xmm7
+       movdqa  96(%esp),%xmm1
+       movdqu  %xmm2,-96(%edi,%esi,1)
+       movdqu  %xmm3,-80(%edi,%esi,1)
+       movdqu  %xmm4,-64(%edi,%esi,1)
+       movdqu  %xmm5,-48(%edi,%esi,1)
+       movdqu  %xmm6,-32(%edi,%esi,1)
+       movdqu  %xmm7,-16(%edi,%esi,1)
+       cmpl    %eax,%esi
+       jb      .L077grandloop
+.L076short:
+       addl    $96,%eax
+       subl    %esi,%eax
+       jz      .L078done
+       cmpl    $32,%eax
+       jb      .L079one
+       je      .L080two
+       cmpl    $64,%eax
+       jb      .L081three
+       je      .L082four
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm6,%xmm1
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm1,96(%esp)
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movl    120(%esp),%edi
+       call    .L_aesni_encrypt6_enter
+       movdqa  64(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    %xmm0,%xmm6
+       movdqa  96(%esp),%xmm1
+       movdqu  %xmm2,(%edi,%esi,1)
+       movdqu  %xmm3,16(%edi,%esi,1)
+       movdqu  %xmm4,32(%edi,%esi,1)
+       movdqu  %xmm5,48(%edi,%esi,1)
+       movdqu  %xmm6,64(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L079one:
+       movdqu  (%ebx),%xmm7
+       movl    112(%esp),%edx
+       movdqu  (%esi),%xmm2
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movl    120(%esp),%edi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+.L083enc1_loop_16:
+.byte  102,15,56,220,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L083enc1_loop_16
+.byte  102,15,56,221,209
+       xorps   %xmm7,%xmm2
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm6,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L080two:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm6
+       movdqu  (%ebx,%ecx,1),%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm7,%xmm3
+       movdqa  %xmm1,%xmm5
+       movl    120(%esp),%edi
+       call    __aesni_encrypt2
+       xorps   %xmm6,%xmm2
+       xorps   %xmm7,%xmm3
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm5,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       movups  %xmm3,16(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L081three:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm5
+       movdqu  (%ebx,%ecx,1),%xmm6
+       movdqa  %xmm5,%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm5,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm6,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm7,%xmm4
+       movdqa  %xmm1,96(%esp)
+       movl    120(%esp),%edi
+       call    __aesni_encrypt3
+       xorps   %xmm5,%xmm2
+       xorps   %xmm6,%xmm3
+       xorps   %xmm7,%xmm4
+       movdqa  %xmm7,%xmm0
+       movdqa  96(%esp),%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       movups  %xmm3,16(%edi,%esi,1)
+       movups  %xmm4,32(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L082four:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       movl    112(%esp),%edx
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm4
+       movdqu  (%ebx,%ecx,1),%xmm5
+       movdqa  %xmm4,%xmm6
+       movdqu  (%ebx,%eax,1),%xmm7
+       pxor    %xmm0,%xmm4
+       movdqu  (%esi),%xmm2
+       pxor    %xmm4,%xmm5
+       movdqu  16(%esi),%xmm3
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm4,(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm5,16(%esp)
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movl    240(%edx),%ecx
+       pxor    %xmm2,%xmm1
+       pxor    (%esp),%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm7,%xmm5
+       movdqa  %xmm1,96(%esp)
+       movl    120(%esp),%edi
+       call    __aesni_encrypt4
+       xorps   (%esp),%xmm2
+       xorps   16(%esp),%xmm3
+       xorps   %xmm6,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       xorps   %xmm7,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       movdqa  %xmm7,%xmm0
+       movups  %xmm4,32(%edi,%esi,1)
+       movdqa  96(%esp),%xmm1
+       movups  %xmm5,48(%edi,%esi,1)
+.L078done:
+       movl    128(%esp),%edx
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm2,16(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm2,32(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm2,48(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm2,64(%esp)
+       movdqa  %xmm2,80(%esp)
+       movdqa  %xmm2,96(%esp)
+       leal    (%edx),%esp
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movdqu  %xmm0,(%ecx)
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm1,(%ebx)
+       pxor    %xmm1,%xmm1
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _aesni_ocb_decrypt
+.def   _aesni_ocb_decrypt;     .scl    2;      .type   32;     .endef
+.align 16
+_aesni_ocb_decrypt:
+.L_aesni_ocb_decrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movl    20(%esp),%esi
+       movl    24(%esp),%edi
+       movl    28(%esp),%eax
+       movl    32(%esp),%edx
+       movdqu  (%ecx),%xmm0
+       movl    36(%esp),%ebp
+       movdqu  (%ebx),%xmm1
+       movl    44(%esp),%ebx
+       movl    %esp,%ecx
+       subl    $132,%esp
+       andl    $-16,%esp
+       subl    %esi,%edi
+       shll    $4,%eax
+       leal    -96(%esi,%eax,1),%eax
+       movl    %edi,120(%esp)
+       movl    %eax,124(%esp)
+       movl    %ecx,128(%esp)
+       movl    240(%edx),%ecx
+       testl   $1,%ebp
+       jnz     .L084odd
+       bsfl    %ebp,%eax
+       addl    $1,%ebp
+       shll    $4,%eax
+       movdqu  (%ebx,%eax,1),%xmm7
+       movl    %edx,%eax
+       movdqu  (%esi),%xmm2
+       leal    16(%esi),%esi
+       pxor    %xmm0,%xmm7
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+.L085dec1_loop_17:
+.byte  102,15,56,222,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L085dec1_loop_17
+.byte  102,15,56,223,209
+       xorps   %xmm7,%xmm2
+       movaps  %xmm6,%xmm1
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm1
+       movups  %xmm2,-16(%edi,%esi,1)
+       movl    240(%eax),%ecx
+       movl    %eax,%edx
+       movl    124(%esp),%eax
+.L084odd:
+       shll    $4,%ecx
+       movl    $16,%edi
+       subl    %ecx,%edi
+       movl    %edx,112(%esp)
+       leal    32(%edx,%ecx,1),%edx
+       movl    %edi,116(%esp)
+       cmpl    %eax,%esi
+       ja      .L086short
+       jmp     .L087grandloop
+.align 32
+.L087grandloop:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       leal    5(%ebp),%edi
+       addl    $6,%ebp
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       bsfl    %edi,%edi
+       shll    $4,%ecx
+       shll    $4,%eax
+       shll    $4,%edi
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       movdqu  (%ebx,%edi,1),%xmm7
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movdqa  %xmm7,80(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       movdqu  80(%esi),%xmm7
+       leal    96(%esi),%esi
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       pxor    %xmm0,%xmm4
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    80(%esp),%xmm7
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movl    120(%esp),%edi
+       movl    124(%esp),%eax
+       call    .L_aesni_decrypt6_enter
+       movdqa  80(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       movdqa  96(%esp),%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       movdqu  %xmm2,-96(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqu  %xmm3,-80(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movdqu  %xmm4,-64(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+       movdqu  %xmm5,-48(%edi,%esi,1)
+       pxor    %xmm6,%xmm1
+       movdqu  %xmm6,-32(%edi,%esi,1)
+       pxor    %xmm7,%xmm1
+       movdqu  %xmm7,-16(%edi,%esi,1)
+       cmpl    %eax,%esi
+       jb      .L087grandloop
+.L086short:
+       addl    $96,%eax
+       subl    %esi,%eax
+       jz      .L088done
+       cmpl    $32,%eax
+       jb      .L089one
+       je      .L090two
+       cmpl    $64,%eax
+       jb      .L091three
+       je      .L092four
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       pxor    %xmm0,%xmm4
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movl    120(%esp),%edi
+       call    .L_aesni_decrypt6_enter
+       movdqa  64(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       movdqa  96(%esp),%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm2,%xmm1
+       movdqu  %xmm2,(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqu  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movdqu  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+       movdqu  %xmm5,48(%edi,%esi,1)
+       pxor    %xmm6,%xmm1
+       movdqu  %xmm6,64(%edi,%esi,1)
+       jmp     .L088done
+.align 16
+.L089one:
+       movdqu  (%ebx),%xmm7
+       movl    112(%esp),%edx
+       movdqu  (%esi),%xmm2
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm7
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movl    120(%esp),%edi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+.L093dec1_loop_18:
+.byte  102,15,56,222,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L093dec1_loop_18
+.byte  102,15,56,223,209
+       xorps   %xmm7,%xmm2
+       movaps  %xmm6,%xmm1
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       jmp     .L088done
+.align 16
+.L090two:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm6
+       movdqu  (%ebx,%ecx,1),%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm6,%xmm2
+       pxor    %xmm7,%xmm3
+       movl    120(%esp),%edi
+       call    __aesni_decrypt2
+       xorps   %xmm6,%xmm2
+       xorps   %xmm7,%xmm3
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm5
+       movups  %xmm2,(%edi,%esi,1)
+       xorps   %xmm3,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       movaps  %xmm5,%xmm1
+       jmp     .L088done
+.align 16
+.L091three:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm5
+       movdqu  (%ebx,%ecx,1),%xmm6
+       movdqa  %xmm5,%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm5,%xmm2
+       pxor    %xmm6,%xmm3
+       pxor    %xmm7,%xmm4
+       movl    120(%esp),%edi
+       call    __aesni_decrypt3
+       movdqa  96(%esp),%xmm1
+       xorps   %xmm5,%xmm2
+       xorps   %xmm6,%xmm3
+       xorps   %xmm7,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm7,%xmm0
+       movups  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movups  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       jmp     .L088done
+.align 16
+.L092four:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       movl    112(%esp),%edx
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm4
+       movdqu  (%ebx,%ecx,1),%xmm5
+       movdqa  %xmm4,%xmm6
+       movdqu  (%ebx,%eax,1),%xmm7
+       pxor    %xmm0,%xmm4
+       movdqu  (%esi),%xmm2
+       pxor    %xmm4,%xmm5
+       movdqu  16(%esi),%xmm3
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm4,(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm5,16(%esp)
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,96(%esp)
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm7,%xmm5
+       movl    120(%esp),%edi
+       call    __aesni_decrypt4
+       movdqa  96(%esp),%xmm1
+       xorps   (%esp),%xmm2
+       xorps   16(%esp),%xmm3
+       xorps   %xmm6,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       pxor    %xmm2,%xmm1
+       xorps   %xmm7,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqa  %xmm7,%xmm0
+       movups  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movups  %xmm5,48(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+.L088done:
+       movl    128(%esp),%edx
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm2,16(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm2,32(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm2,48(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm2,64(%esp)
+       movdqa  %xmm2,80(%esp)
+       movdqa  %xmm2,96(%esp)
+       leal    (%edx),%esp
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movdqu  %xmm0,(%ecx)
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm1,(%ebx)
+       pxor    %xmm1,%xmm1
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _aesni_cbc_encrypt
+.def   _aesni_cbc_encrypt;     .scl    2;      .type   32;     .endef
+.align 16
+_aesni_cbc_encrypt:
+.L_aesni_cbc_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       movl    %esp,%ebx
+       movl    24(%esp),%edi
+       subl    $24,%ebx
+       movl    28(%esp),%eax
+       andl    $-16,%ebx
+       movl    32(%esp),%edx
+       movl    36(%esp),%ebp
+       testl   %eax,%eax
+       jz      .L094cbc_abort
+       cmpl    $0,40(%esp)
+       xchgl   %esp,%ebx
+       movups  (%ebp),%xmm7
+       movl    240(%edx),%ecx
+       movl    %edx,%ebp
+       movl    %ebx,16(%esp)
+       movl    %ecx,%ebx
+       je      .L095cbc_decrypt
+       movaps  %xmm7,%xmm2
+       cmpl    $16,%eax
+       jb      .L096cbc_enc_tail
+       subl    $16,%eax
+       jmp     .L097cbc_enc_loop
+.align 16
+.L097cbc_enc_loop:
+       movups  (%esi),%xmm7
+       leal    16(%esi),%esi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm7
+       leal    32(%edx),%edx
+       xorps   %xmm7,%xmm2
+.L098enc1_loop_19:
+.byte  102,15,56,220,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L098enc1_loop_19
+.byte  102,15,56,221,209
+       movl    %ebx,%ecx
+       movl    %ebp,%edx
+       movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+       subl    $16,%eax
+       jnc     .L097cbc_enc_loop
+       addl    $16,%eax
+       jnz     .L096cbc_enc_tail
+       movaps  %xmm2,%xmm7
+       pxor    %xmm2,%xmm2
+       jmp     .L099cbc_ret
+.L096cbc_enc_tail:
+       movl    %eax,%ecx
+.long  2767451785
+       movl    $16,%ecx
+       subl    %eax,%ecx
+       xorl    %eax,%eax
+.long  2868115081
+       leal    -16(%edi),%edi
+       movl    %ebx,%ecx
+       movl    %edi,%esi
+       movl    %ebp,%edx
+       jmp     .L097cbc_enc_loop
+.align 16
+.L095cbc_decrypt:
+       cmpl    $80,%eax
+       jbe     .L100cbc_dec_tail
+       movaps  %xmm7,(%esp)
+       subl    $80,%eax
+       jmp     .L101cbc_dec_loop6_enter
+.align 16
+.L102cbc_dec_loop6:
+       movaps  %xmm0,(%esp)
+       movups  %xmm7,(%edi)
+       leal    16(%edi),%edi
+.L101cbc_dec_loop6_enter:
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -1808,28 +2713,28 @@ _aesni_cbc_encrypt:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      .L078cbc_dec_loop6
+       ja      .L102cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     .L079cbc_dec_tail_collected
+       jle     .L103cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-.L076cbc_dec_tail:
+.L100cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     .L080cbc_dec_one
+       jbe     .L104cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     .L081cbc_dec_two
+       jbe     .L105cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     .L082cbc_dec_three
+       jbe     .L106cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     .L083cbc_dec_four
+       jbe     .L107cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1847,56 +2752,62 @@ _aesni_cbc_encrypt:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L080cbc_dec_one:
+.L104cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L084dec1_loop_16:
+.L109dec1_loop_20:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L084dec1_loop_16
+       jnz     .L109dec1_loop_20
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L081cbc_dec_two:
-       xorps   %xmm4,%xmm4
-       call    __aesni_decrypt3
+.L105cbc_dec_two:
+       call    __aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L082cbc_dec_three:
+.L106cbc_dec_three:
        call    __aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L083cbc_dec_four:
+.L107cbc_dec_four:
        call    __aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1906,28 +2817,44 @@ _aesni_cbc_encrypt:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-.L079cbc_dec_tail_collected:
+       jmp     .L108cbc_dec_tail_collected
+.align 16
+.L103cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+.L108cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     .L085cbc_dec_tail_partial
+       jnz     .L110cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     .L075cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     .L099cbc_ret
 .align 16
-.L085cbc_dec_tail_partial:
+.L110cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-.L075cbc_ret:
+       movdqa  %xmm2,(%esp)
+.L099cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-.L070cbc_abort:
+       pxor    %xmm7,%xmm7
+.L094cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1936,52 +2863,62 @@ _aesni_cbc_encrypt:
 .def   __aesni_set_encrypt_key;        .scl    3;      .type   32;     .endef
 .align 16
 __aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      .L086bad_pointer
+       jz      .L111bad_pointer
        testl   %edx,%edx
-       jz      .L086bad_pointer
+       jz      .L111bad_pointer
+       call    .L112pic
+.L112pic:
+       popl    %ebx
+       leal    .Lkey_const-.L112pic(%ebx),%ebx
+       leal    __gnutls_x86_cpuid_s,%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      .L08714rounds
+       je      .L11314rounds
        cmpl    $192,%ecx
-       je      .L08812rounds
+       je      .L11412rounds
        cmpl    $128,%ecx
-       jne     .L089bad_keybits
+       jne     .L115bad_keybits
 .align 16
-.L09010rounds:
+.L11610rounds:
+       cmpl    $268435456,%ebp
+       je      .L11710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    .L091key_128_cold
+       call    .L118key_128_cold
 .byte  102,15,58,223,200,2
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,4
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,8
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,16
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,32
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,64
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,128
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,27
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,54
-       call    .L092key_128
+       call    .L119key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L120good_key
 .align 16
-.L092key_128:
+.L119key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-.L091key_128_cold:
+.L118key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -1990,38 +2927,91 @@ __aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L08812rounds:
+.L11710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+.L121loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     .L121loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     .L120good_key
+.align 16
+.L11412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      .L12212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L093key_192a_cold
+       call    .L123key_192a_cold
 .byte  102,15,58,223,202,2
-       call    .L094key_192b
+       call    .L124key_192b
 .byte  102,15,58,223,202,4
-       call    .L095key_192a
+       call    .L125key_192a
 .byte  102,15,58,223,202,8
-       call    .L094key_192b
+       call    .L124key_192b
 .byte  102,15,58,223,202,16
-       call    .L095key_192a
+       call    .L125key_192a
 .byte  102,15,58,223,202,32
-       call    .L094key_192b
+       call    .L124key_192b
 .byte  102,15,58,223,202,64
-       call    .L095key_192a
+       call    .L125key_192a
 .byte  102,15,58,223,202,128
-       call    .L094key_192b
+       call    .L124key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L120good_key
 .align 16
-.L095key_192a:
+.L125key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 16
-.L093key_192a_cold:
+.L123key_192a_cold:
        movaps  %xmm2,%xmm5
-.L096key_192b_warm:
+.L126key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2035,56 +3025,90 @@ __aesni_set_encrypt_key:
        pxor    %xmm3,%xmm2
        ret
 .align 16
-.L094key_192b:
+.L124key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     .L096key_192b_warm
+       jmp     .L126key_192b_warm
+.align 16
+.L12212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+.L127loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     .L127loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     .L120good_key
 .align 16
-.L08714rounds:
+.L11314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      .L12814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L097key_256a_cold
+       call    .L129key_256a_cold
 .byte  102,15,58,223,200,1
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,2
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,2
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,4
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,4
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,8
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,8
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,16
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,16
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,32
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,32
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,64
-       call    .L099key_256a
+       call    .L131key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     .L120good_key
 .align 16
-.L099key_256a:
+.L131key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-.L097key_256a_cold:
+.L129key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2093,7 +3117,7 @@ __aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L098key_256b:
+.L130key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2103,13 +3127,70 @@ __aesni_set_encrypt_key:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 16
+.L12814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+.L132loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      .L133done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     .L132loop_key256
+.L133done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+.L120good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 4
-.L086bad_pointer:
+.L111bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 4
-.L089bad_keybits:
+.L115bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .globl _aesni_set_encrypt_key
 .def   _aesni_set_encrypt_key; .scl    2;      .type   32;     .endef
@@ -2133,7 +3214,7 @@ _aesni_set_decrypt_key:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     .L100dec_key_ret
+       jnz     .L134dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2141,7 +3222,7 @@ _aesni_set_decrypt_key:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-.L101dec_key_inverse:
+.L135dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2151,15 +3232,24 @@ _aesni_set_decrypt_key:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      .L101dec_key_inverse
+       ja      .L135dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-.L100dec_key_ret:
+.L134dec_key_ret:
        ret
+.align 64
+.Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
+.comm  __gnutls_x86_cpuid_s,16
 
index 79ffbf70c7b881e9495407e2a21feacc4b42cfa9..4e8de065f21a54d6c15303155e1fbedcf82cc814 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,7 @@
 .def   aesni_encrypt;  .scl 2; .type 32;       .endef
 .p2align       4
 aesni_encrypt:
+
        movups  (%rcx),%xmm2
        movl    240(%r8),%eax
        movups  (%r8),%xmm0
@@ -63,10 +64,12 @@ aesni_encrypt:
        .byte   0xf3,0xc3
 
 
+
 .globl aesni_decrypt
 .def   aesni_decrypt;  .scl 2; .type 32;       .endef
 .p2align       4
 aesni_decrypt:
+
        movups  (%rcx),%xmm2
        movl    240(%r8),%eax
        movups  (%r8),%xmm0
@@ -86,9 +89,11 @@ aesni_decrypt:
        pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 
+
 .def   _aesni_encrypt2;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_encrypt2:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -115,9 +120,11 @@ _aesni_encrypt2:
 .byte  102,15,56,221,216
        .byte   0xf3,0xc3
 
+
 .def   _aesni_decrypt2;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_decrypt2:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -144,9 +151,11 @@ _aesni_decrypt2:
 .byte  102,15,56,223,216
        .byte   0xf3,0xc3
 
+
 .def   _aesni_encrypt3;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_encrypt3:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -178,9 +187,11 @@ _aesni_encrypt3:
 .byte  102,15,56,221,224
        .byte   0xf3,0xc3
 
+
 .def   _aesni_decrypt3;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_decrypt3:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -212,9 +223,11 @@ _aesni_decrypt3:
 .byte  102,15,56,223,224
        .byte   0xf3,0xc3
 
+
 .def   _aesni_encrypt4;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_encrypt4:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -252,9 +265,11 @@ _aesni_encrypt4:
 .byte  102,15,56,221,232
        .byte   0xf3,0xc3
 
+
 .def   _aesni_decrypt4;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_decrypt4:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -292,9 +307,11 @@ _aesni_decrypt4:
 .byte  102,15,56,223,232
        .byte   0xf3,0xc3
 
+
 .def   _aesni_encrypt6;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_encrypt6:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -346,9 +363,11 @@ _aesni_encrypt6:
 .byte  102,15,56,221,248
        .byte   0xf3,0xc3
 
+
 .def   _aesni_decrypt6;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_decrypt6:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -400,9 +419,11 @@ _aesni_decrypt6:
 .byte  102,15,56,223,248
        .byte   0xf3,0xc3
 
+
 .def   _aesni_encrypt8;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_encrypt8:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -464,9 +485,11 @@ _aesni_encrypt8:
 .byte  102,68,15,56,221,200
        .byte   0xf3,0xc3
 
+
 .def   _aesni_decrypt8;        .scl 3; .type 32;       .endef
 .p2align       4
 _aesni_decrypt8:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -528,6 +551,7 @@ _aesni_decrypt8:
 .byte  102,68,15,56,223,200
        .byte   0xf3,0xc3
 
+
 .globl aesni_ecb_encrypt
 .def   aesni_ecb_encrypt;      .scl 2; .type 32;       .endef
 .p2align       4
@@ -542,6 +566,7 @@ aesni_ecb_encrypt:
        movq    %r9,%rcx
        movq    40(%rsp),%r8
 
+
        leaq    -88(%rsp),%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,16(%rsp)
@@ -897,6 +922,7 @@ aesni_ecb_encrypt:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_ecb_encrypt:
 .globl aesni_ccm64_encrypt_blocks
 .def   aesni_ccm64_encrypt_blocks;     .scl 2; .type 32;       .endef
@@ -1130,6 +1156,7 @@ aesni_ctr32_encrypt_blocks:
        movq    %r9,%rcx
        movq    40(%rsp),%r8
 
+
        cmpq    $1,%rdx
        jne     .Lctr32_bulk
 
@@ -1159,22 +1186,23 @@ aesni_ctr32_encrypt_blocks:
 
 .p2align       4
 .Lctr32_bulk:
-       leaq    (%rsp),%rax
+       leaq    (%rsp),%r11
+
        pushq   %rbp
+
        subq    $288,%rsp
        andq    $-16,%rsp
-       movaps  %xmm6,-168(%rax)
-       movaps  %xmm7,-152(%rax)
-       movaps  %xmm8,-136(%rax)
-       movaps  %xmm9,-120(%rax)
-       movaps  %xmm10,-104(%rax)
-       movaps  %xmm11,-88(%rax)
-       movaps  %xmm12,-72(%rax)
-       movaps  %xmm13,-56(%rax)
-       movaps  %xmm14,-40(%rax)
-       movaps  %xmm15,-24(%rax)
+       movaps  %xmm6,-168(%r11)
+       movaps  %xmm7,-152(%r11)
+       movaps  %xmm8,-136(%r11)
+       movaps  %xmm9,-120(%r11)
+       movaps  %xmm10,-104(%r11)
+       movaps  %xmm11,-88(%r11)
+       movaps  %xmm12,-72(%r11)
+       movaps  %xmm13,-56(%r11)
+       movaps  %xmm14,-40(%r11)
+       movaps  %xmm15,-24(%r11)
 .Lctr32_body:
-       leaq    -8(%rax),%rbp
 
 
 
@@ -1183,7 +1211,7 @@ aesni_ctr32_encrypt_blocks:
        movdqu  (%rcx),%xmm0
        movl    12(%r8),%r8d
        pxor    %xmm0,%xmm2
-       movl    12(%rcx),%r11d
+       movl    12(%rcx),%ebp
        movdqa  %xmm2,0(%rsp)
        bswapl  %r8d
        movdqa  %xmm2,%xmm3
@@ -1199,8 +1227,8 @@ aesni_ctr32_encrypt_blocks:
        leaq    2(%r8),%rdx
        bswapl  %eax
        bswapl  %edx
-       xorl    %r11d,%eax
-       xorl    %r11d,%edx
+       xorl    %ebp,%eax
+       xorl    %ebp,%edx
 .byte  102,15,58,34,216,3
        leaq    3(%r8),%rax
        movdqa  %xmm3,16(%rsp)
@@ -1209,25 +1237,25 @@ aesni_ctr32_encrypt_blocks:
        movq    %r10,%rdx
        leaq    4(%r8),%r10
        movdqa  %xmm4,32(%rsp)
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
        bswapl  %r10d
 .byte  102,15,58,34,232,3
-       xorl    %r11d,%r10d
+       xorl    %ebp,%r10d
        movdqa  %xmm5,48(%rsp)
        leaq    5(%r8),%r9
        movl    %r10d,64+12(%rsp)
        bswapl  %r9d
        leaq    6(%r8),%r10
        movl    240(%rcx),%eax
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        bswapl  %r10d
        movl    %r9d,80+12(%rsp)
-       xorl    %r11d,%r10d
+       xorl    %ebp,%r10d
        leaq    7(%r8),%r9
        movl    %r10d,96+12(%rsp)
        bswapl  %r9d
        movl    _gnutls_x86_cpuid_s+4(%rip),%r10d
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        andl    $71303168,%r10d
        movl    %r9d,112+12(%rsp)
 
@@ -1251,7 +1279,7 @@ aesni_ctr32_encrypt_blocks:
 .Lctr32_6x:
        shll    $4,%eax
        movl    $48,%r10d
-       bswapl  %r11d
+       bswapl  %ebp
        leaq    32(%rcx,%rax,1),%rcx
        subq    %rax,%r10
        jmp     .Lctr32_loop6
@@ -1262,32 +1290,32 @@ aesni_ctr32_encrypt_blocks:
        movups  -48(%rcx,%r10,1),%xmm0
 .byte  102,15,56,220,209
        movl    %r8d,%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,217
 .byte  0x0f,0x38,0xf1,0x44,0x24,12
        leal    1(%r8),%eax
 .byte  102,15,56,220,225
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  0x0f,0x38,0xf1,0x44,0x24,28
 .byte  102,15,56,220,233
        leal    2(%r8),%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,241
 .byte  0x0f,0x38,0xf1,0x44,0x24,44
        leal    3(%r8),%eax
 .byte  102,15,56,220,249
        movups  -32(%rcx,%r10,1),%xmm1
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 
 .byte  102,15,56,220,208
 .byte  0x0f,0x38,0xf1,0x44,0x24,60
        leal    4(%r8),%eax
 .byte  102,15,56,220,216
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  0x0f,0x38,0xf1,0x44,0x24,76
 .byte  102,15,56,220,224
        leal    5(%r8),%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,232
 .byte  0x0f,0x38,0xf1,0x44,0x24,92
        movq    %r10,%rax
@@ -1348,7 +1376,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
        movups  32-128(%rcx),%xmm0
 .byte  102,15,56,220,225
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        nop
 .byte  102,15,56,220,233
        movl    %r9d,0+12(%rsp)
@@ -1361,7 +1389,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1375,7 +1403,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1389,7 +1417,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1403,7 +1431,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1417,7 +1445,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1431,7 +1459,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1446,7 +1474,7 @@ aesni_ctr32_encrypt_blocks:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
 .byte  102,15,56,220,224
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        movdqu  0(%rdi),%xmm10
 .byte  102,15,56,220,232
        movl    %r9d,112+12(%rsp)
@@ -1681,32 +1709,32 @@ aesni_ctr32_encrypt_blocks:
 
 .Lctr32_done:
        xorps   %xmm0,%xmm0
-       xorl    %r11d,%r11d
+       xorl    %ebp,%ebp
        pxor    %xmm1,%xmm1
        pxor    %xmm2,%xmm2
        pxor    %xmm3,%xmm3
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
-       movaps  -160(%rbp),%xmm6
-       movaps  %xmm0,-160(%rbp)
-       movaps  -144(%rbp),%xmm7
-       movaps  %xmm0,-144(%rbp)
-       movaps  -128(%rbp),%xmm8
-       movaps  %xmm0,-128(%rbp)
-       movaps  -112(%rbp),%xmm9
-       movaps  %xmm0,-112(%rbp)
-       movaps  -96(%rbp),%xmm10
-       movaps  %xmm0,-96(%rbp)
-       movaps  -80(%rbp),%xmm11
-       movaps  %xmm0,-80(%rbp)
-       movaps  -64(%rbp),%xmm12
-       movaps  %xmm0,-64(%rbp)
-       movaps  -48(%rbp),%xmm13
-       movaps  %xmm0,-48(%rbp)
-       movaps  -32(%rbp),%xmm14
-       movaps  %xmm0,-32(%rbp)
-       movaps  -16(%rbp),%xmm15
-       movaps  %xmm0,-16(%rbp)
+       movaps  -168(%r11),%xmm6
+       movaps  %xmm0,-168(%r11)
+       movaps  -152(%r11),%xmm7
+       movaps  %xmm0,-152(%r11)
+       movaps  -136(%r11),%xmm8
+       movaps  %xmm0,-136(%r11)
+       movaps  -120(%r11),%xmm9
+       movaps  %xmm0,-120(%r11)
+       movaps  -104(%r11),%xmm10
+       movaps  %xmm0,-104(%r11)
+       movaps  -88(%r11),%xmm11
+       movaps  %xmm0,-88(%r11)
+       movaps  -72(%r11),%xmm12
+       movaps  %xmm0,-72(%r11)
+       movaps  -56(%r11),%xmm13
+       movaps  %xmm0,-56(%r11)
+       movaps  -40(%r11),%xmm14
+       movaps  %xmm0,-40(%r11)
+       movaps  -24(%r11),%xmm15
+       movaps  %xmm0,-24(%r11)
        movaps  %xmm0,0(%rsp)
        movaps  %xmm0,16(%rsp)
        movaps  %xmm0,32(%rsp)
@@ -1715,12 +1743,15 @@ aesni_ctr32_encrypt_blocks:
        movaps  %xmm0,80(%rsp)
        movaps  %xmm0,96(%rsp)
        movaps  %xmm0,112(%rsp)
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 .Lctr32_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_ctr32_encrypt_blocks:
 .globl aesni_xts_encrypt
 .def   aesni_xts_encrypt;      .scl 2; .type 32;       .endef
@@ -1737,22 +1768,24 @@ aesni_xts_encrypt:
        movq    40(%rsp),%r8
        movq    48(%rsp),%r9
 
-       leaq    (%rsp),%rax
+
+       leaq    (%rsp),%r11
+
        pushq   %rbp
+
        subq    $272,%rsp
        andq    $-16,%rsp
-       movaps  %xmm6,-168(%rax)
-       movaps  %xmm7,-152(%rax)
-       movaps  %xmm8,-136(%rax)
-       movaps  %xmm9,-120(%rax)
-       movaps  %xmm10,-104(%rax)
-       movaps  %xmm11,-88(%rax)
-       movaps  %xmm12,-72(%rax)
-       movaps  %xmm13,-56(%rax)
-       movaps  %xmm14,-40(%rax)
-       movaps  %xmm15,-24(%rax)
+       movaps  %xmm6,-168(%r11)
+       movaps  %xmm7,-152(%r11)
+       movaps  %xmm8,-136(%r11)
+       movaps  %xmm9,-120(%r11)
+       movaps  %xmm10,-104(%r11)
+       movaps  %xmm11,-88(%r11)
+       movaps  %xmm12,-72(%r11)
+       movaps  %xmm13,-56(%r11)
+       movaps  %xmm14,-40(%r11)
+       movaps  %xmm15,-24(%r11)
 .Lxts_enc_body:
-       leaq    -8(%rax),%rbp
        movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
@@ -1768,7 +1801,7 @@ aesni_xts_encrypt:
        jnz     .Loop_enc1_8
 .byte  102,15,56,221,209
        movups  (%rcx),%xmm0
-       movq    %rcx,%r11
+       movq    %rcx,%rbp
        movl    %r10d,%eax
        shll    $4,%r10d
        movq    %rdx,%r9
@@ -1824,9 +1857,9 @@ aesni_xts_encrypt:
        jc      .Lxts_enc_short
 
        movl    $16+96,%eax
-       leaq    32(%r11,%r10,1),%rcx
+       leaq    32(%rbp,%r10,1),%rcx
        subq    %r10,%rax
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
        movq    %rax,%r10
        leaq    .Lxts_magic(%rip),%r8
        jmp     .Lxts_enc_grandloop
@@ -1851,7 +1884,7 @@ aesni_xts_encrypt:
        movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
 .byte  102,15,56,220,233
-       movups  32(%r11),%xmm0
+       movups  32(%rbp),%xmm0
        leaq    96(%rdi),%rdi
        pxor    %xmm8,%xmm7
 
@@ -1860,7 +1893,7 @@ aesni_xts_encrypt:
        pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,220,249
-       movups  48(%r11),%xmm1
+       movups  48(%rbp),%xmm1
        pxor    %xmm9,%xmm12
 
 .byte  102,15,56,220,208
@@ -1875,7 +1908,7 @@ aesni_xts_encrypt:
        movdqa  %xmm14,64(%rsp)
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  64(%r11),%xmm0
+       movups  64(%rbp),%xmm0
        movdqa  %xmm8,80(%rsp)
        pshufd  $0x5f,%xmm15,%xmm9
        jmp     .Lxts_enc_loop6
@@ -1907,7 +1940,7 @@ aesni_xts_encrypt:
        psrad   $31,%xmm14
 .byte  102,15,56,220,217
        pand    %xmm8,%xmm14
-       movups  (%r11),%xmm10
+       movups  (%rbp),%xmm10
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -1975,10 +2008,10 @@ aesni_xts_encrypt:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
        pxor    %xmm0,%xmm15
-       movups  (%r11),%xmm0
+       movups  (%rbp),%xmm0
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
 
        pxor    %xmm15,%xmm14
 .byte  102,15,56,221,84,36,0
@@ -2005,7 +2038,7 @@ aesni_xts_encrypt:
 
        movl    $16+96,%eax
        subl    %r10d,%eax
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        shrl    $4,%eax
 
 .Lxts_enc_short:
@@ -2161,7 +2194,7 @@ aesni_xts_encrypt:
        jnz     .Lxts_enc_steal
 
        subq    %r9,%rsi
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  -16(%rsi),%xmm2
@@ -2187,26 +2220,26 @@ aesni_xts_encrypt:
        pxor    %xmm3,%xmm3
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
-       movaps  -160(%rbp),%xmm6
-       movaps  %xmm0,-160(%rbp)
-       movaps  -144(%rbp),%xmm7
-       movaps  %xmm0,-144(%rbp)
-       movaps  -128(%rbp),%xmm8
-       movaps  %xmm0,-128(%rbp)
-       movaps  -112(%rbp),%xmm9
-       movaps  %xmm0,-112(%rbp)
-       movaps  -96(%rbp),%xmm10
-       movaps  %xmm0,-96(%rbp)
-       movaps  -80(%rbp),%xmm11
-       movaps  %xmm0,-80(%rbp)
-       movaps  -64(%rbp),%xmm12
-       movaps  %xmm0,-64(%rbp)
-       movaps  -48(%rbp),%xmm13
-       movaps  %xmm0,-48(%rbp)
-       movaps  -32(%rbp),%xmm14
-       movaps  %xmm0,-32(%rbp)
-       movaps  -16(%rbp),%xmm15
-       movaps  %xmm0,-16(%rbp)
+       movaps  -168(%r11),%xmm6
+       movaps  %xmm0,-168(%r11)
+       movaps  -152(%r11),%xmm7
+       movaps  %xmm0,-152(%r11)
+       movaps  -136(%r11),%xmm8
+       movaps  %xmm0,-136(%r11)
+       movaps  -120(%r11),%xmm9
+       movaps  %xmm0,-120(%r11)
+       movaps  -104(%r11),%xmm10
+       movaps  %xmm0,-104(%r11)
+       movaps  -88(%r11),%xmm11
+       movaps  %xmm0,-88(%r11)
+       movaps  -72(%r11),%xmm12
+       movaps  %xmm0,-72(%r11)
+       movaps  -56(%r11),%xmm13
+       movaps  %xmm0,-56(%r11)
+       movaps  -40(%r11),%xmm14
+       movaps  %xmm0,-40(%r11)
+       movaps  -24(%r11),%xmm15
+       movaps  %xmm0,-24(%r11)
        movaps  %xmm0,0(%rsp)
        movaps  %xmm0,16(%rsp)
        movaps  %xmm0,32(%rsp)
@@ -2214,12 +2247,15 @@ aesni_xts_encrypt:
        movaps  %xmm0,64(%rsp)
        movaps  %xmm0,80(%rsp)
        movaps  %xmm0,96(%rsp)
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 .Lxts_enc_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_xts_encrypt:
 .globl aesni_xts_decrypt
 .def   aesni_xts_decrypt;      .scl 2; .type 32;       .endef
@@ -2236,22 +2272,24 @@ aesni_xts_decrypt:
        movq    40(%rsp),%r8
        movq    48(%rsp),%r9
 
-       leaq    (%rsp),%rax
+
+       leaq    (%rsp),%r11
+
        pushq   %rbp
+
        subq    $272,%rsp
        andq    $-16,%rsp
-       movaps  %xmm6,-168(%rax)
-       movaps  %xmm7,-152(%rax)
-       movaps  %xmm8,-136(%rax)
-       movaps  %xmm9,-120(%rax)
-       movaps  %xmm10,-104(%rax)
-       movaps  %xmm11,-88(%rax)
-       movaps  %xmm12,-72(%rax)
-       movaps  %xmm13,-56(%rax)
-       movaps  %xmm14,-40(%rax)
-       movaps  %xmm15,-24(%rax)
+       movaps  %xmm6,-168(%r11)
+       movaps  %xmm7,-152(%r11)
+       movaps  %xmm8,-136(%r11)
+       movaps  %xmm9,-120(%r11)
+       movaps  %xmm10,-104(%r11)
+       movaps  %xmm11,-88(%r11)
+       movaps  %xmm12,-72(%r11)
+       movaps  %xmm13,-56(%r11)
+       movaps  %xmm14,-40(%r11)
+       movaps  %xmm15,-24(%r11)
 .Lxts_dec_body:
-       leaq    -8(%rax),%rbp
        movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
@@ -2273,7 +2311,7 @@ aesni_xts_decrypt:
        subq    %rax,%rdx
 
        movups  (%rcx),%xmm0
-       movq    %rcx,%r11
+       movq    %rcx,%rbp
        movl    %r10d,%eax
        shll    $4,%r10d
        movq    %rdx,%r9
@@ -2329,9 +2367,9 @@ aesni_xts_decrypt:
        jc      .Lxts_dec_short
 
        movl    $16+96,%eax
-       leaq    32(%r11,%r10,1),%rcx
+       leaq    32(%rbp,%r10,1),%rcx
        subq    %r10,%rax
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
        movq    %rax,%r10
        leaq    .Lxts_magic(%rip),%r8
        jmp     .Lxts_dec_grandloop
@@ -2356,7 +2394,7 @@ aesni_xts_decrypt:
        movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
 .byte  102,15,56,222,233
-       movups  32(%r11),%xmm0
+       movups  32(%rbp),%xmm0
        leaq    96(%rdi),%rdi
        pxor    %xmm8,%xmm7
 
@@ -2365,7 +2403,7 @@ aesni_xts_decrypt:
        pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,222,249
-       movups  48(%r11),%xmm1
+       movups  48(%rbp),%xmm1
        pxor    %xmm9,%xmm12
 
 .byte  102,15,56,222,208
@@ -2380,7 +2418,7 @@ aesni_xts_decrypt:
        movdqa  %xmm14,64(%rsp)
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  64(%r11),%xmm0
+       movups  64(%rbp),%xmm0
        movdqa  %xmm8,80(%rsp)
        pshufd  $0x5f,%xmm15,%xmm9
        jmp     .Lxts_dec_loop6
@@ -2412,7 +2450,7 @@ aesni_xts_decrypt:
        psrad   $31,%xmm14
 .byte  102,15,56,222,217
        pand    %xmm8,%xmm14
-       movups  (%r11),%xmm10
+       movups  (%rbp),%xmm10
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -2480,10 +2518,10 @@ aesni_xts_decrypt:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
        pxor    %xmm0,%xmm15
-       movups  (%r11),%xmm0
+       movups  (%rbp),%xmm0
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
 
        pxor    %xmm15,%xmm14
 .byte  102,15,56,223,84,36,0
@@ -2510,7 +2548,7 @@ aesni_xts_decrypt:
 
        movl    $16+96,%eax
        subl    %r10d,%eax
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        shrl    $4,%eax
 
 .Lxts_dec_short:
@@ -2667,7 +2705,7 @@ aesni_xts_decrypt:
        jz      .Lxts_dec_ret
 .Lxts_dec_done2:
        movq    %r9,%rdx
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  (%rdi),%xmm2
@@ -2697,7 +2735,7 @@ aesni_xts_decrypt:
        jnz     .Lxts_dec_steal
 
        subq    %r9,%rsi
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  (%rsi),%xmm2
@@ -2723,26 +2761,26 @@ aesni_xts_decrypt:
        pxor    %xmm3,%xmm3
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
-       movaps  -160(%rbp),%xmm6
-       movaps  %xmm0,-160(%rbp)
-       movaps  -144(%rbp),%xmm7
-       movaps  %xmm0,-144(%rbp)
-       movaps  -128(%rbp),%xmm8
-       movaps  %xmm0,-128(%rbp)
-       movaps  -112(%rbp),%xmm9
-       movaps  %xmm0,-112(%rbp)
-       movaps  -96(%rbp),%xmm10
-       movaps  %xmm0,-96(%rbp)
-       movaps  -80(%rbp),%xmm11
-       movaps  %xmm0,-80(%rbp)
-       movaps  -64(%rbp),%xmm12
-       movaps  %xmm0,-64(%rbp)
-       movaps  -48(%rbp),%xmm13
-       movaps  %xmm0,-48(%rbp)
-       movaps  -32(%rbp),%xmm14
-       movaps  %xmm0,-32(%rbp)
-       movaps  -16(%rbp),%xmm15
-       movaps  %xmm0,-16(%rbp)
+       movaps  -168(%r11),%xmm6
+       movaps  %xmm0,-168(%r11)
+       movaps  -152(%r11),%xmm7
+       movaps  %xmm0,-152(%r11)
+       movaps  -136(%r11),%xmm8
+       movaps  %xmm0,-136(%r11)
+       movaps  -120(%r11),%xmm9
+       movaps  %xmm0,-120(%r11)
+       movaps  -104(%r11),%xmm10
+       movaps  %xmm0,-104(%r11)
+       movaps  -88(%r11),%xmm11
+       movaps  %xmm0,-88(%r11)
+       movaps  -72(%r11),%xmm12
+       movaps  %xmm0,-72(%r11)
+       movaps  -56(%r11),%xmm13
+       movaps  %xmm0,-56(%r11)
+       movaps  -40(%r11),%xmm14
+       movaps  %xmm0,-40(%r11)
+       movaps  -24(%r11),%xmm15
+       movaps  %xmm0,-24(%r11)
        movaps  %xmm0,0(%rsp)
        movaps  %xmm0,16(%rsp)
        movaps  %xmm0,32(%rsp)
@@ -2750,21 +2788,24 @@ aesni_xts_decrypt:
        movaps  %xmm0,64(%rsp)
        movaps  %xmm0,80(%rsp)
        movaps  %xmm0,96(%rsp)
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 .Lxts_dec_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_xts_decrypt:
-.globl aesni_cbc_encrypt
-.def   aesni_cbc_encrypt;      .scl 2; .type 32;       .endef
-.p2align       4
-aesni_cbc_encrypt:
+.globl aesni_ocb_encrypt
+.def   aesni_ocb_encrypt;      .scl 2; .type 32;       .endef
+.p2align       5
+aesni_ocb_encrypt:
        movq    %rdi,8(%rsp)
        movq    %rsi,16(%rsp)
        movq    %rsp,%rax
-.LSEH_begin_aesni_cbc_encrypt:
+.LSEH_begin_aesni_ocb_encrypt:
        movq    %rcx,%rdi
        movq    %rdx,%rsi
        movq    %r8,%rdx
@@ -2772,181 +2813,1096 @@ aesni_cbc_encrypt:
        movq    40(%rsp),%r8
        movq    48(%rsp),%r9
 
-       testq   %rdx,%rdx
-       jz      .Lcbc_ret
+
+       leaq    (%rsp),%rax
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       leaq    -160(%rsp),%rsp
+       movaps  %xmm6,0(%rsp)
+       movaps  %xmm7,16(%rsp)
+       movaps  %xmm8,32(%rsp)
+       movaps  %xmm9,48(%rsp)
+       movaps  %xmm10,64(%rsp)
+       movaps  %xmm11,80(%rsp)
+       movaps  %xmm12,96(%rsp)
+       movaps  %xmm13,112(%rsp)
+       movaps  %xmm14,128(%rsp)
+       movaps  %xmm15,144(%rsp)
+.Locb_enc_body:
+       movq    56(%rax),%rbx
+       movq    56+8(%rax),%rbp
 
        movl    240(%rcx),%r10d
        movq    %rcx,%r11
-       testl   %r9d,%r9d
-       jz      .Lcbc_decrypt
+       shll    $4,%r10d
+       movups  (%rcx),%xmm9
+       movups  16(%rcx,%r10,1),%xmm1
 
-       movups  (%r8),%xmm2
-       movl    %r10d,%eax
-       cmpq    $16,%rdx
-       jb      .Lcbc_enc_tail
-       subq    $16,%rdx
-       jmp     .Lcbc_enc_loop
-.p2align       4
-.Lcbc_enc_loop:
-       movups  (%rdi),%xmm3
-       leaq    16(%rdi),%rdi
+       movdqu  (%r9),%xmm15
+       pxor    %xmm1,%xmm9
+       pxor    %xmm1,%xmm15
 
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       xorps   %xmm0,%xmm3
-       leaq    32(%rcx),%rcx
-       xorps   %xmm3,%xmm2
-.Loop_enc1_15:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_15
-.byte  102,15,56,221,209
-       movl    %r10d,%eax
-       movq    %r11,%rcx
-       movups  %xmm2,0(%rsi)
-       leaq    16(%rsi),%rsi
-       subq    $16,%rdx
-       jnc     .Lcbc_enc_loop
-       addq    $16,%rdx
-       jnz     .Lcbc_enc_tail
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       movups  %xmm2,(%r8)
-       pxor    %xmm2,%xmm2
-       pxor    %xmm3,%xmm3
-       jmp     .Lcbc_ret
+       movl    $16+32,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       movups  16(%r11),%xmm1
+       subq    %r10,%rax
+       movq    %rax,%r10
 
-.Lcbc_enc_tail:
-       movq    %rdx,%rcx
-       xchgq   %rdi,%rsi
-.long  0x9066A4F3
-       movl    $16,%ecx
-       subq    %rdx,%rcx
-       xorl    %eax,%eax
-.long  0x9066AAF3
-       leaq    -16(%rdi),%rdi
-       movl    %r10d,%eax
-       movq    %rdi,%rsi
-       movq    %r11,%rcx
-       xorq    %rdx,%rdx
-       jmp     .Lcbc_enc_loop
+       movdqu  (%rbx),%xmm10
+       movdqu  (%rbp),%xmm8
 
-.p2align       4
-.Lcbc_decrypt:
-       cmpq    $16,%rdx
-       jne     .Lcbc_decrypt_bulk
+       testq   $1,%r8
+       jnz     .Locb_enc_odd
 
+       bsfq    %r8,%r12
+       addq    $1,%r8
+       shlq    $4,%r12
+       movdqu  (%rbx,%r12,1),%xmm7
+       movdqu  (%rdi),%xmm2
+       leaq    16(%rdi),%rdi
 
+       call    __ocb_encrypt1
 
-       movdqu  (%rdi),%xmm2
-       movdqu  (%r8),%xmm3
-       movdqa  %xmm2,%xmm4
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-.Loop_dec1_16:
-.byte  102,15,56,222,209
-       decl    %r10d
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_dec1_16
-.byte  102,15,56,223,209
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       movdqu  %xmm4,(%r8)
-       xorps   %xmm3,%xmm2
-       pxor    %xmm3,%xmm3
+       movdqa  %xmm7,%xmm15
        movups  %xmm2,(%rsi)
-       pxor    %xmm2,%xmm2
-       jmp     .Lcbc_ret
-.p2align       4
-.Lcbc_decrypt_bulk:
-       leaq    (%rsp),%rax
-       pushq   %rbp
-       subq    $176,%rsp
-       andq    $-16,%rsp
-       movaps  %xmm6,16(%rsp)
-       movaps  %xmm7,32(%rsp)
-       movaps  %xmm8,48(%rsp)
-       movaps  %xmm9,64(%rsp)
-       movaps  %xmm10,80(%rsp)
-       movaps  %xmm11,96(%rsp)
-       movaps  %xmm12,112(%rsp)
-       movaps  %xmm13,128(%rsp)
-       movaps  %xmm14,144(%rsp)
-       movaps  %xmm15,160(%rsp)
-.Lcbc_decrypt_body:
-       leaq    -8(%rax),%rbp
-       movups  (%r8),%xmm10
-       movl    %r10d,%eax
-       cmpq    $0x50,%rdx
-       jbe     .Lcbc_dec_tail
+       leaq    16(%rsi),%rsi
+       subq    $1,%rdx
+       jz      .Locb_enc_done
+
+.Locb_enc_odd:
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       leaq    6(%r8),%r8
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+       shlq    $4,%r12
+       shlq    $4,%r13
+       shlq    $4,%r14
 
-       movups  (%rcx),%xmm0
+       subq    $6,%rdx
+       jc      .Locb_enc_short
+       jmp     .Locb_enc_grandloop
+
+.p2align       5
+.Locb_enc_grandloop:
        movdqu  0(%rdi),%xmm2
        movdqu  16(%rdi),%xmm3
-       movdqa  %xmm2,%xmm11
        movdqu  32(%rdi),%xmm4
-       movdqa  %xmm3,%xmm12
        movdqu  48(%rdi),%xmm5
-       movdqa  %xmm4,%xmm13
        movdqu  64(%rdi),%xmm6
-       movdqa  %xmm5,%xmm14
        movdqu  80(%rdi),%xmm7
-       movdqa  %xmm6,%xmm15
-       movl    _gnutls_x86_cpuid_s+4(%rip),%r9d
-       cmpq    $0x70,%rdx
-       jbe     .Lcbc_dec_six_or_seven
+       leaq    96(%rdi),%rdi
 
-       andl    $71303168,%r9d
-       subq    $0x50,%rdx
-       cmpl    $4194304,%r9d
-       je      .Lcbc_dec_loop6_enter
-       subq    $0x20,%rdx
-       leaq    112(%rcx),%rcx
-       jmp     .Lcbc_dec_loop8_enter
-.p2align       4
-.Lcbc_dec_loop8:
-       movups  %xmm9,(%rsi)
-       leaq    16(%rsi),%rsi
-.Lcbc_dec_loop8_enter:
-       movdqu  96(%rdi),%xmm8
-       pxor    %xmm0,%xmm2
-       movdqu  112(%rdi),%xmm9
-       pxor    %xmm0,%xmm3
-       movups  16-112(%rcx),%xmm1
-       pxor    %xmm0,%xmm4
-       xorq    %r11,%r11
-       cmpq    $0x70,%rdx
-       pxor    %xmm0,%xmm5
-       pxor    %xmm0,%xmm6
-       pxor    %xmm0,%xmm7
-       pxor    %xmm0,%xmm8
+       call    __ocb_encrypt6
 
-.byte  102,15,56,222,209
-       pxor    %xmm0,%xmm9
-       movups  32-112(%rcx),%xmm0
-.byte  102,15,56,222,217
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-       setnc   %r11b
-       shlq    $7,%r11
-.byte  102,68,15,56,222,201
-       addq    %rdi,%r11
-       movups  48-112(%rcx),%xmm1
-.byte  102,15,56,222,208
-.byte  102,15,56,222,216
-.byte  102,15,56,222,224
-.byte  102,15,56,222,232
-.byte  102,15,56,222,240
-.byte  102,15,56,222,248
-.byte  102,68,15,56,222,192
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+       movups  %xmm6,64(%rsi)
+       movups  %xmm7,80(%rsi)
+       leaq    96(%rsi),%rsi
+       subq    $6,%rdx
+       jnc     .Locb_enc_grandloop
+
+.Locb_enc_short:
+       addq    $6,%rdx
+       jz      .Locb_enc_done
+
+       movdqu  0(%rdi),%xmm2
+       cmpq    $2,%rdx
+       jb      .Locb_enc_one
+       movdqu  16(%rdi),%xmm3
+       je      .Locb_enc_two
+
+       movdqu  32(%rdi),%xmm4
+       cmpq    $4,%rdx
+       jb      .Locb_enc_three
+       movdqu  48(%rdi),%xmm5
+       je      .Locb_enc_four
+
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm7,%xmm7
+
+       call    __ocb_encrypt6
+
+       movdqa  %xmm14,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+       movups  %xmm6,64(%rsi)
+
+       jmp     .Locb_enc_done
+
+.p2align       4
+.Locb_enc_one:
+       movdqa  %xmm10,%xmm7
+
+       call    __ocb_encrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,0(%rsi)
+       jmp     .Locb_enc_done
+
+.p2align       4
+.Locb_enc_two:
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_encrypt4
+
+       movdqa  %xmm11,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+
+       jmp     .Locb_enc_done
+
+.p2align       4
+.Locb_enc_three:
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_encrypt4
+
+       movdqa  %xmm12,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+
+       jmp     .Locb_enc_done
+
+.p2align       4
+.Locb_enc_four:
+       call    __ocb_encrypt4
+
+       movdqa  %xmm13,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+
+.Locb_enc_done:
+       pxor    %xmm0,%xmm15
+       movdqu  %xmm8,(%rbp)
+       movdqu  %xmm15,(%r9)
+
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       movaps  0(%rsp),%xmm6
+       movaps  %xmm0,0(%rsp)
+       movaps  16(%rsp),%xmm7
+       movaps  %xmm0,16(%rsp)
+       movaps  32(%rsp),%xmm8
+       movaps  %xmm0,32(%rsp)
+       movaps  48(%rsp),%xmm9
+       movaps  %xmm0,48(%rsp)
+       movaps  64(%rsp),%xmm10
+       movaps  %xmm0,64(%rsp)
+       movaps  80(%rsp),%xmm11
+       movaps  %xmm0,80(%rsp)
+       movaps  96(%rsp),%xmm12
+       movaps  %xmm0,96(%rsp)
+       movaps  112(%rsp),%xmm13
+       movaps  %xmm0,112(%rsp)
+       movaps  128(%rsp),%xmm14
+       movaps  %xmm0,128(%rsp)
+       movaps  144(%rsp),%xmm15
+       movaps  %xmm0,144(%rsp)
+       leaq    160+40(%rsp),%rax
+.Locb_enc_pop:
+       movq    -40(%rax),%r14
+
+       movq    -32(%rax),%r13
+
+       movq    -24(%rax),%r12
+
+       movq    -16(%rax),%rbp
+
+       movq    -8(%rax),%rbx
+
+       leaq    (%rax),%rsp
+
+.Locb_enc_epilogue:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_aesni_ocb_encrypt:
+
+.def   __ocb_encrypt6; .scl 3; .type 32;       .endef
+.p2align       5
+__ocb_encrypt6:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       movdqa  %xmm10,%xmm14
+       pxor    %xmm15,%xmm10
+       movdqu  (%rbx,%r14,1),%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm2,%xmm8
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm3,%xmm8
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm4,%xmm8
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm14
+       pxor    %xmm5,%xmm8
+       pxor    %xmm13,%xmm5
+       pxor    %xmm14,%xmm15
+       pxor    %xmm6,%xmm8
+       pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm8
+       pxor    %xmm15,%xmm7
+       movups  32(%r11),%xmm0
+
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       addq    $6,%r8
+       pxor    %xmm9,%xmm10
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+.byte  102,15,56,220,241
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm14
+.byte  102,15,56,220,249
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm15
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  64(%r11),%xmm0
+       shlq    $4,%r12
+       shlq    $4,%r13
+       jmp     .Locb_enc_loop6
+
+.p2align       5
+.Locb_enc_loop6:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_enc_loop6
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  16(%r11),%xmm1
+       shlq    $4,%r14
+
+.byte  102,65,15,56,221,210
+       movdqu  (%rbx),%xmm10
+       movq    %r10,%rax
+.byte  102,65,15,56,221,219
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+.byte  102,65,15,56,221,246
+.byte  102,65,15,56,221,255
+       .byte   0xf3,0xc3
+
+
+.def   __ocb_encrypt4; .scl 3; .type 32;       .endef
+.p2align       5
+__ocb_encrypt4:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       pxor    %xmm15,%xmm10
+       pxor    %xmm10,%xmm11
+       pxor    %xmm2,%xmm8
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm3,%xmm8
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm4,%xmm8
+       pxor    %xmm12,%xmm4
+       pxor    %xmm5,%xmm8
+       pxor    %xmm13,%xmm5
+       movups  32(%r11),%xmm0
+
+       pxor    %xmm9,%xmm10
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+       pxor    %xmm9,%xmm13
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  48(%r11),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  64(%r11),%xmm0
+       jmp     .Locb_enc_loop4
+
+.p2align       5
+.Locb_enc_loop4:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_enc_loop4
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,65,15,56,221,210
+.byte  102,65,15,56,221,219
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+       .byte   0xf3,0xc3
+
+
+.def   __ocb_encrypt1; .scl 3; .type 32;       .endef
+.p2align       5
+__ocb_encrypt1:
+       pxor    %xmm15,%xmm7
+       pxor    %xmm9,%xmm7
+       pxor    %xmm2,%xmm8
+       pxor    %xmm7,%xmm2
+       movups  32(%r11),%xmm0
+
+.byte  102,15,56,220,209
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm7
+
+.byte  102,15,56,220,208
+       movups  64(%r11),%xmm0
+       jmp     .Locb_enc_loop1
+
+.p2align       5
+.Locb_enc_loop1:
+.byte  102,15,56,220,209
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_enc_loop1
+
+.byte  102,15,56,220,209
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,15,56,221,215
+       .byte   0xf3,0xc3
+
+
+.globl aesni_ocb_decrypt
+.def   aesni_ocb_decrypt;      .scl 2; .type 32;       .endef
+.p2align       5
+aesni_ocb_decrypt:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_aesni_ocb_decrypt:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+       movq    %r9,%rcx
+       movq    40(%rsp),%r8
+       movq    48(%rsp),%r9
+
+
+       leaq    (%rsp),%rax
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       leaq    -160(%rsp),%rsp
+       movaps  %xmm6,0(%rsp)
+       movaps  %xmm7,16(%rsp)
+       movaps  %xmm8,32(%rsp)
+       movaps  %xmm9,48(%rsp)
+       movaps  %xmm10,64(%rsp)
+       movaps  %xmm11,80(%rsp)
+       movaps  %xmm12,96(%rsp)
+       movaps  %xmm13,112(%rsp)
+       movaps  %xmm14,128(%rsp)
+       movaps  %xmm15,144(%rsp)
+.Locb_dec_body:
+       movq    56(%rax),%rbx
+       movq    56+8(%rax),%rbp
+
+       movl    240(%rcx),%r10d
+       movq    %rcx,%r11
+       shll    $4,%r10d
+       movups  (%rcx),%xmm9
+       movups  16(%rcx,%r10,1),%xmm1
+
+       movdqu  (%r9),%xmm15
+       pxor    %xmm1,%xmm9
+       pxor    %xmm1,%xmm15
+
+       movl    $16+32,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       movups  16(%r11),%xmm1
+       subq    %r10,%rax
+       movq    %rax,%r10
+
+       movdqu  (%rbx),%xmm10
+       movdqu  (%rbp),%xmm8
+
+       testq   $1,%r8
+       jnz     .Locb_dec_odd
+
+       bsfq    %r8,%r12
+       addq    $1,%r8
+       shlq    $4,%r12
+       movdqu  (%rbx,%r12,1),%xmm7
+       movdqu  (%rdi),%xmm2
+       leaq    16(%rdi),%rdi
+
+       call    __ocb_decrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm8
+       leaq    16(%rsi),%rsi
+       subq    $1,%rdx
+       jz      .Locb_dec_done
+
+.Locb_dec_odd:
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       leaq    6(%r8),%r8
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+       shlq    $4,%r12
+       shlq    $4,%r13
+       shlq    $4,%r14
+
+       subq    $6,%rdx
+       jc      .Locb_dec_short
+       jmp     .Locb_dec_grandloop
+
+.p2align       5
+.Locb_dec_grandloop:
+       movdqu  0(%rdi),%xmm2
+       movdqu  16(%rdi),%xmm3
+       movdqu  32(%rdi),%xmm4
+       movdqu  48(%rdi),%xmm5
+       movdqu  64(%rdi),%xmm6
+       movdqu  80(%rdi),%xmm7
+       leaq    96(%rdi),%rdi
+
+       call    __ocb_decrypt6
+
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+       movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm8
+       movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm8
+       leaq    96(%rsi),%rsi
+       subq    $6,%rdx
+       jnc     .Locb_dec_grandloop
+
+.Locb_dec_short:
+       addq    $6,%rdx
+       jz      .Locb_dec_done
+
+       movdqu  0(%rdi),%xmm2
+       cmpq    $2,%rdx
+       jb      .Locb_dec_one
+       movdqu  16(%rdi),%xmm3
+       je      .Locb_dec_two
+
+       movdqu  32(%rdi),%xmm4
+       cmpq    $4,%rdx
+       jb      .Locb_dec_three
+       movdqu  48(%rdi),%xmm5
+       je      .Locb_dec_four
+
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm7,%xmm7
+
+       call    __ocb_decrypt6
+
+       movdqa  %xmm14,%xmm15
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+       movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm8
+
+       jmp     .Locb_dec_done
+
+.p2align       4
+.Locb_dec_one:
+       movdqa  %xmm10,%xmm7
+
+       call    __ocb_decrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       jmp     .Locb_dec_done
+
+.p2align       4
+.Locb_dec_two:
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_decrypt4
+
+       movdqa  %xmm11,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       xorps   %xmm3,%xmm8
+
+       jmp     .Locb_dec_done
+
+.p2align       4
+.Locb_dec_three:
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_decrypt4
+
+       movdqa  %xmm12,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       xorps   %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       xorps   %xmm4,%xmm8
+
+       jmp     .Locb_dec_done
+
+.p2align       4
+.Locb_dec_four:
+       call    __ocb_decrypt4
+
+       movdqa  %xmm13,%xmm15
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+
+.Locb_dec_done:
+       pxor    %xmm0,%xmm15
+       movdqu  %xmm8,(%rbp)
+       movdqu  %xmm15,(%r9)
+
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       movaps  0(%rsp),%xmm6
+       movaps  %xmm0,0(%rsp)
+       movaps  16(%rsp),%xmm7
+       movaps  %xmm0,16(%rsp)
+       movaps  32(%rsp),%xmm8
+       movaps  %xmm0,32(%rsp)
+       movaps  48(%rsp),%xmm9
+       movaps  %xmm0,48(%rsp)
+       movaps  64(%rsp),%xmm10
+       movaps  %xmm0,64(%rsp)
+       movaps  80(%rsp),%xmm11
+       movaps  %xmm0,80(%rsp)
+       movaps  96(%rsp),%xmm12
+       movaps  %xmm0,96(%rsp)
+       movaps  112(%rsp),%xmm13
+       movaps  %xmm0,112(%rsp)
+       movaps  128(%rsp),%xmm14
+       movaps  %xmm0,128(%rsp)
+       movaps  144(%rsp),%xmm15
+       movaps  %xmm0,144(%rsp)
+       leaq    160+40(%rsp),%rax
+.Locb_dec_pop:
+       movq    -40(%rax),%r14
+
+       movq    -32(%rax),%r13
+
+       movq    -24(%rax),%r12
+
+       movq    -16(%rax),%rbp
+
+       movq    -8(%rax),%rbx
+
+       leaq    (%rax),%rsp
+
+.Locb_dec_epilogue:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_aesni_ocb_decrypt:
+
+.def   __ocb_decrypt6; .scl 3; .type 32;       .endef
+.p2align       5
+__ocb_decrypt6:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       movdqa  %xmm10,%xmm14
+       pxor    %xmm15,%xmm10
+       movdqu  (%rbx,%r14,1),%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm14
+       pxor    %xmm13,%xmm5
+       pxor    %xmm14,%xmm15
+       pxor    %xmm14,%xmm6
+       pxor    %xmm15,%xmm7
+       movups  32(%r11),%xmm0
+
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       addq    $6,%r8
+       pxor    %xmm9,%xmm10
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+.byte  102,15,56,222,241
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm14
+.byte  102,15,56,222,249
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm15
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+       movups  64(%r11),%xmm0
+       shlq    $4,%r12
+       shlq    $4,%r13
+       jmp     .Locb_dec_loop6
+
+.p2align       5
+.Locb_dec_loop6:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_dec_loop6
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movups  16(%r11),%xmm1
+       shlq    $4,%r14
+
+.byte  102,65,15,56,223,210
+       movdqu  (%rbx),%xmm10
+       movq    %r10,%rax
+.byte  102,65,15,56,223,219
+.byte  102,65,15,56,223,228
+.byte  102,65,15,56,223,237
+.byte  102,65,15,56,223,246
+.byte  102,65,15,56,223,255
+       .byte   0xf3,0xc3
+
+
+.def   __ocb_decrypt4; .scl 3; .type 32;       .endef
+.p2align       5
+__ocb_decrypt4:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       pxor    %xmm15,%xmm10
+       pxor    %xmm10,%xmm11
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm5
+       movups  32(%r11),%xmm0
+
+       pxor    %xmm9,%xmm10
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+       pxor    %xmm9,%xmm13
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  48(%r11),%xmm1
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  64(%r11),%xmm0
+       jmp     .Locb_dec_loop4
+
+.p2align       5
+.Locb_dec_loop4:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_dec_loop4
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,65,15,56,223,210
+.byte  102,65,15,56,223,219
+.byte  102,65,15,56,223,228
+.byte  102,65,15,56,223,237
+       .byte   0xf3,0xc3
+
+
+.def   __ocb_decrypt1; .scl 3; .type 32;       .endef
+.p2align       5
+__ocb_decrypt1:
+       pxor    %xmm15,%xmm7
+       pxor    %xmm9,%xmm7
+       pxor    %xmm7,%xmm2
+       movups  32(%r11),%xmm0
+
+.byte  102,15,56,222,209
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm7
+
+.byte  102,15,56,222,208
+       movups  64(%r11),%xmm0
+       jmp     .Locb_dec_loop1
+
+.p2align       5
+.Locb_dec_loop1:
+.byte  102,15,56,222,209
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,222,208
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_dec_loop1
+
+.byte  102,15,56,222,209
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,15,56,223,215
+       .byte   0xf3,0xc3
+
+.globl aesni_cbc_encrypt
+.def   aesni_cbc_encrypt;      .scl 2; .type 32;       .endef
+.p2align       4
+aesni_cbc_encrypt:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_aesni_cbc_encrypt:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+       movq    %r9,%rcx
+       movq    40(%rsp),%r8
+       movq    48(%rsp),%r9
+
+
+       testq   %rdx,%rdx
+       jz      .Lcbc_ret
+
+       movl    240(%rcx),%r10d
+       movq    %rcx,%r11
+       testl   %r9d,%r9d
+       jz      .Lcbc_decrypt
+
+       movups  (%r8),%xmm2
+       movl    %r10d,%eax
+       cmpq    $16,%rdx
+       jb      .Lcbc_enc_tail
+       subq    $16,%rdx
+       jmp     .Lcbc_enc_loop
+.p2align       4
+.Lcbc_enc_loop:
+       movups  (%rdi),%xmm3
+       leaq    16(%rdi),%rdi
+
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       xorps   %xmm0,%xmm3
+       leaq    32(%rcx),%rcx
+       xorps   %xmm3,%xmm2
+.Loop_enc1_15:
+.byte  102,15,56,220,209
+       decl    %eax
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_enc1_15
+.byte  102,15,56,221,209
+       movl    %r10d,%eax
+       movq    %r11,%rcx
+       movups  %xmm2,0(%rsi)
+       leaq    16(%rsi),%rsi
+       subq    $16,%rdx
+       jnc     .Lcbc_enc_loop
+       addq    $16,%rdx
+       jnz     .Lcbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       jmp     .Lcbc_ret
+
+.Lcbc_enc_tail:
+       movq    %rdx,%rcx
+       xchgq   %rdi,%rsi
+.long  0x9066A4F3
+       movl    $16,%ecx
+       subq    %rdx,%rcx
+       xorl    %eax,%eax
+.long  0x9066AAF3
+       leaq    -16(%rdi),%rdi
+       movl    %r10d,%eax
+       movq    %rdi,%rsi
+       movq    %r11,%rcx
+       xorq    %rdx,%rdx
+       jmp     .Lcbc_enc_loop
+
+.p2align       4
+.Lcbc_decrypt:
+       cmpq    $16,%rdx
+       jne     .Lcbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     .Lcbc_ret
+.p2align       4
+.Lcbc_decrypt_bulk:
+       leaq    (%rsp),%r11
+
+       pushq   %rbp
+
+       subq    $176,%rsp
+       andq    $-16,%rsp
+       movaps  %xmm6,16(%rsp)
+       movaps  %xmm7,32(%rsp)
+       movaps  %xmm8,48(%rsp)
+       movaps  %xmm9,64(%rsp)
+       movaps  %xmm10,80(%rsp)
+       movaps  %xmm11,96(%rsp)
+       movaps  %xmm12,112(%rsp)
+       movaps  %xmm13,128(%rsp)
+       movaps  %xmm14,144(%rsp)
+       movaps  %xmm15,160(%rsp)
+.Lcbc_decrypt_body:
+       movq    %rcx,%rbp
+       movups  (%r8),%xmm10
+       movl    %r10d,%eax
+       cmpq    $0x50,%rdx
+       jbe     .Lcbc_dec_tail
+
+       movups  (%rcx),%xmm0
+       movdqu  0(%rdi),%xmm2
+       movdqu  16(%rdi),%xmm3
+       movdqa  %xmm2,%xmm11
+       movdqu  32(%rdi),%xmm4
+       movdqa  %xmm3,%xmm12
+       movdqu  48(%rdi),%xmm5
+       movdqa  %xmm4,%xmm13
+       movdqu  64(%rdi),%xmm6
+       movdqa  %xmm5,%xmm14
+       movdqu  80(%rdi),%xmm7
+       movdqa  %xmm6,%xmm15
+       movl    _gnutls_x86_cpuid_s+4(%rip),%r9d
+       cmpq    $0x70,%rdx
+       jbe     .Lcbc_dec_six_or_seven
+
+       andl    $71303168,%r9d
+       subq    $0x50,%rdx
+       cmpl    $4194304,%r9d
+       je      .Lcbc_dec_loop6_enter
+       subq    $0x20,%rdx
+       leaq    112(%rcx),%rcx
+       jmp     .Lcbc_dec_loop8_enter
+.p2align       4
+.Lcbc_dec_loop8:
+       movups  %xmm9,(%rsi)
+       leaq    16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+       movdqu  96(%rdi),%xmm8
+       pxor    %xmm0,%xmm2
+       movdqu  112(%rdi),%xmm9
+       pxor    %xmm0,%xmm3
+       movups  16-112(%rcx),%xmm1
+       pxor    %xmm0,%xmm4
+       movq    $-1,%rbp
+       cmpq    $0x70,%rdx
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+       pxor    %xmm0,%xmm8
+
+.byte  102,15,56,222,209
+       pxor    %xmm0,%xmm9
+       movups  32-112(%rcx),%xmm0
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+.byte  102,68,15,56,222,193
+       adcq    $0,%rbp
+       andq    $128,%rbp
+.byte  102,68,15,56,222,201
+       addq    %rdi,%rbp
+       movups  48-112(%rcx),%xmm1
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+.byte  102,68,15,56,222,192
 .byte  102,68,15,56,222,200
        movups  64-112(%rcx),%xmm0
        nop
@@ -3076,18 +4032,18 @@ aesni_cbc_encrypt:
        movdqu  112(%rdi),%xmm0
 .byte  102,65,15,56,223,228
        leaq    128(%rdi),%rdi
-       movdqu  0(%r11),%xmm11
+       movdqu  0(%rbp),%xmm11
 .byte  102,65,15,56,223,237
 .byte  102,65,15,56,223,246
-       movdqu  16(%r11),%xmm12
-       movdqu  32(%r11),%xmm13
+       movdqu  16(%rbp),%xmm12
+       movdqu  32(%rbp),%xmm13
 .byte  102,65,15,56,223,255
 .byte  102,68,15,56,223,193
-       movdqu  48(%r11),%xmm14
-       movdqu  64(%r11),%xmm15
+       movdqu  48(%rbp),%xmm14
+       movdqu  64(%rbp),%xmm15
 .byte  102,69,15,56,223,202
        movdqa  %xmm0,%xmm10
-       movdqu  80(%r11),%xmm1
+       movdqu  80(%rbp),%xmm1
        movups  -112(%rcx),%xmm0
 
        movups  %xmm2,(%rsi)
@@ -3206,7 +4162,7 @@ aesni_cbc_encrypt:
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
        pxor    %xmm14,%xmm6
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movdqu  %xmm5,48(%rsi)
        pxor    %xmm15,%xmm7
        movl    %r10d,%eax
@@ -3375,18 +4331,23 @@ aesni_cbc_encrypt:
        movaps  %xmm0,144(%rsp)
        movaps  160(%rsp),%xmm15
        movaps  %xmm0,160(%rsp)
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 .Lcbc_ret:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_aesni_cbc_encrypt:
 .globl aesni_set_decrypt_key
 .def   aesni_set_decrypt_key;  .scl 2; .type 32;       .endef
 .p2align       4
 aesni_set_decrypt_key:
+
 .byte  0x48,0x83,0xEC,0x08
+
        call    __aesni_set_encrypt_key
        shll    $4,%edx
        testl   %eax,%eax
@@ -3419,7 +4380,9 @@ aesni_set_decrypt_key:
        pxor    %xmm0,%xmm0
 .Ldec_key_ret:
        addq    $8,%rsp
+
        .byte   0xf3,0xc3
+
 .LSEH_end_set_decrypt_key:
 
 .globl aesni_set_encrypt_key
@@ -3427,7 +4390,9 @@ aesni_set_decrypt_key:
 .p2align       4
 aesni_set_encrypt_key:
 __aesni_set_encrypt_key:
+
 .byte  0x48,0x83,0xEC,0x08
+
        movq    $-1,%rax
        testq   %rcx,%rcx
        jz      .Lenc_key_ret
@@ -3720,7 +4685,9 @@ __aesni_set_encrypt_key:
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        addq    $8,%rsp
+
        .byte   0xf3,0xc3
+
 .LSEH_end_set_encrypt_key:
 
 .p2align       4
@@ -3889,13 +4856,75 @@ ctr_xts_se_handler:
        cmpq    %r10,%rbx
        jae     .Lcommon_seh_tail
 
-       movq    160(%r8),%rax
-       leaq    -160(%rax),%rsi
+       movq    208(%r8),%rax
+
+       leaq    -168(%rax),%rsi
+       leaq    512(%r8),%rdi
+       movl    $20,%ecx
+.long  0xa548f3fc
+
+       movq    -8(%rax),%rbp
+       movq    %rbp,160(%r8)
+       jmp     .Lcommon_seh_tail
+
+
+.def   ocb_se_handler; .scl 3; .type 32;       .endef
+.p2align       4
+ocb_se_handler:
+       pushq   %rsi
+       pushq   %rdi
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushfq
+       subq    $64,%rsp
+
+       movq    120(%r8),%rax
+       movq    248(%r8),%rbx
+
+       movq    8(%r9),%rsi
+       movq    56(%r9),%r11
+
+       movl    0(%r11),%r10d
+       leaq    (%rsi,%r10,1),%r10
+       cmpq    %r10,%rbx
+       jb      .Lcommon_seh_tail
+
+       movl    4(%r11),%r10d
+       leaq    (%rsi,%r10,1),%r10
+       cmpq    %r10,%rbx
+       jae     .Lcommon_seh_tail
+
+       movl    8(%r11),%r10d
+       leaq    (%rsi,%r10,1),%r10
+       cmpq    %r10,%rbx
+       jae     .Locb_no_xmm
+
+       movq    152(%r8),%rax
+
+       leaq    (%rax),%rsi
        leaq    512(%r8),%rdi
        movl    $20,%ecx
 .long  0xa548f3fc
+       leaq    160+40(%rax),%rax
+
+.Locb_no_xmm:
+       movq    -8(%rax),%rbx
+       movq    -16(%rax),%rbp
+       movq    -24(%rax),%r12
+       movq    -32(%rax),%r13
+       movq    -40(%rax),%r14
 
-       jmp     .Lcommon_rbp_tail
+       movq    %rbx,144(%r8)
+       movq    %rbp,160(%r8)
+       movq    %r12,216(%r8)
+       movq    %r13,224(%r8)
+       movq    %r14,232(%r8)
+
+       jmp     .Lcommon_seh_tail
 
 .def   cbc_se_handler; .scl 3; .type 32;       .endef
 .p2align       4
@@ -3918,9 +4947,13 @@ cbc_se_handler:
        cmpq    %r10,%rbx
        jb      .Lcommon_seh_tail
 
+       movq    120(%r8),%rax
+
        leaq    .Lcbc_decrypt_body(%rip),%r10
        cmpq    %r10,%rbx
-       jb      .Lrestore_cbc_rax
+       jb      .Lcommon_seh_tail
+
+       movq    152(%r8),%rax
 
        leaq    .Lcbc_ret(%rip),%r10
        cmpq    %r10,%rbx
@@ -3931,15 +4964,10 @@ cbc_se_handler:
        movl    $20,%ecx
 .long  0xa548f3fc
 
-.Lcommon_rbp_tail:
-       movq    160(%r8),%rax
-       movq    (%rax),%rbp
-       leaq    8(%rax),%rax
-       movq    %rbp,160(%r8)
-       jmp     .Lcommon_seh_tail
+       movq    208(%r8),%rax
 
-.Lrestore_cbc_rax:
-       movq    120(%r8),%rax
+       movq    -8(%rax),%rbp
+       movq    %rbp,160(%r8)
 
 .Lcommon_seh_tail:
        movq    8(%rax),%rdi
@@ -4006,6 +5034,14 @@ cbc_se_handler:
 .rva   .LSEH_begin_aesni_xts_decrypt
 .rva   .LSEH_end_aesni_xts_decrypt
 .rva   .LSEH_info_xts_dec
+
+.rva   .LSEH_begin_aesni_ocb_encrypt
+.rva   .LSEH_end_aesni_ocb_encrypt
+.rva   .LSEH_info_ocb_enc
+
+.rva   .LSEH_begin_aesni_ocb_decrypt
+.rva   .LSEH_end_aesni_ocb_decrypt
+.rva   .LSEH_info_ocb_dec
 .rva   .LSEH_begin_aesni_cbc_encrypt
 .rva   .LSEH_end_aesni_cbc_encrypt
 .rva   .LSEH_info_cbc
@@ -4043,6 +5079,18 @@ cbc_se_handler:
 .byte  9,0,0,0
 .rva   ctr_xts_se_handler
 .rva   .Lxts_dec_body,.Lxts_dec_epilogue
+.LSEH_info_ocb_enc:
+.byte  9,0,0,0
+.rva   ocb_se_handler
+.rva   .Locb_enc_body,.Locb_enc_epilogue
+.rva   .Locb_enc_pop
+.long  0
+.LSEH_info_ocb_dec:
+.byte  9,0,0,0
+.rva   ocb_se_handler
+.rva   .Locb_dec_body,.Locb_dec_epilogue
+.rva   .Locb_dec_pop
+.long  0
 .LSEH_info_cbc:
 .byte  9,0,0,0
 .rva   cbc_se_handler
index 4baa8b0b45eba77608962a5d6cf799d4824c1b52..610e9617dbe85c46db9455c6f65386b3c0e04a2a 100644 (file)
@@ -21,7 +21,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "devel/perlasm/cpuid-x86.s"
 .text
 .globl _gnutls_cpuid
 .def   _gnutls_cpuid;  .scl    2;      .type   32;     .endef
index f4bcee28f0f6ed570ef059a35d3759c26fcaf5a0..de207e4002f1924d1153bcfede354956b24172a0 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -51,9 +51,21 @@ gcm_gmult_4bit:
        movq    %rcx,%rdi
        movq    %rdx,%rsi
 
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       subq    $280,%rsp
+
 .Lgmult_prologue:
 
        movzbq  15(%rdi),%r8
@@ -130,12 +142,17 @@ gcm_gmult_4bit:
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)
 
-       movq    16(%rsp),%rbx
-       leaq    24(%rsp),%rsp
+       leaq    280+48(%rsp),%rsi
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
 .Lgmult_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_gcm_gmult_4bit:
 .globl gcm_ghash_4bit
 .def   gcm_ghash_4bit; .scl 2; .type 32;       .endef
@@ -150,13 +167,21 @@ gcm_ghash_4bit:
        movq    %r8,%rdx
        movq    %r9,%rcx
 
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
+
        subq    $280,%rsp
+
 .Lghash_prologue:
        movq    %rdx,%r14
        movq    %rcx,%r15
@@ -701,23 +726,33 @@ gcm_ghash_4bit:
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)
 
-       leaq    280(%rsp),%rsi
-       movq    0(%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
+       leaq    280+48(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    0(%rsi),%rsp
+
 .Lghash_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_gcm_ghash_4bit:
 .globl gcm_init_clmul
 .def   gcm_init_clmul; .scl 2; .type 32;       .endef
 .p2align       4
 gcm_init_clmul:
+
 .L_init_clmul:
 .LSEH_begin_gcm_init_clmul:
 
@@ -877,10 +912,12 @@ gcm_init_clmul:
 .LSEH_end_gcm_init_clmul:
        .byte   0xf3,0xc3
 
+
 .globl gcm_gmult_clmul
 .def   gcm_gmult_clmul;        .scl 2; .type 32;       .endef
 .p2align       4
 gcm_gmult_clmul:
+
 .L_gmult_clmul:
        movdqu  (%rcx),%xmm0
        movdqa  .Lbswap_mask(%rip),%xmm5
@@ -928,10 +965,12 @@ gcm_gmult_clmul:
        movdqu  %xmm0,(%rcx)
        .byte   0xf3,0xc3
 
+
 .globl gcm_ghash_clmul
 .def   gcm_ghash_clmul;        .scl 2; .type 32;       .endef
 .p2align       5
 gcm_ghash_clmul:
+
 .L_ghash_clmul:
        leaq    -136(%rsp),%rax
 .LSEH_begin_gcm_ghash_clmul:
@@ -1337,10 +1376,12 @@ gcm_ghash_clmul:
 .LSEH_end_gcm_ghash_clmul:
        .byte   0xf3,0xc3
 
+
 .globl gcm_init_avx
 .def   gcm_init_avx;   .scl 2; .type 32;       .endef
 .p2align       5
 gcm_init_avx:
+
 .LSEH_begin_gcm_init_avx:
 
 .byte  0x48,0x83,0xec,0x18
@@ -1451,16 +1492,20 @@ gcm_init_avx:
 .LSEH_end_gcm_init_avx:
        .byte   0xf3,0xc3
 
+
 .globl gcm_gmult_avx
 .def   gcm_gmult_avx;  .scl 2; .type 32;       .endef
 .p2align       5
 gcm_gmult_avx:
+
        jmp     .L_gmult_clmul
 
+
 .globl gcm_ghash_avx
 .def   gcm_ghash_avx;  .scl 2; .type 32;       .endef
 .p2align       5
 gcm_ghash_avx:
+
        leaq    -136(%rsp),%rax
 .LSEH_begin_gcm_ghash_avx:
 
@@ -1859,6 +1904,7 @@ gcm_ghash_avx:
 .LSEH_end_gcm_ghash_avx:
        .byte   0xf3,0xc3
 
+
 .p2align       6
 .Lbswap_mask:
 .byte  15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
@@ -1945,14 +1991,20 @@ se_handler:
        cmpq    %r10,%rbx
        jae     .Lin_prologue
 
-       leaq    24(%rax),%rax
+       leaq    48+280(%rax),%rax
 
        movq    -8(%rax),%rbx
        movq    -16(%rax),%rbp
        movq    -24(%rax),%r12
+       movq    -32(%rax),%r13
+       movq    -40(%rax),%r14
+       movq    -48(%rax),%r15
        movq    %rbx,144(%r8)
        movq    %rbp,160(%r8)
        movq    %r12,216(%r8)
+       movq    %r13,224(%r8)
+       movq    %r14,232(%r8)
+       movq    %r15,240(%r8)
 
 .Lin_prologue:
        movq    8(%rax),%rdi
index 22c17e73531d18e53823f0878188296f41231cea..30f9ded212a7874ec4ba7dfbdb482ac647e35188 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha1-586.s"
 .text
 .globl _sha1_block_data_order
 .def   _sha1_block_data_order; .scl    2;      .type   32;     .endef
index 13203c2b909df51ff11b1bf9803f9bfd2a291308..cdfc88254ef0a6f92ff5f60727c7e51ae4f23e5b 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -52,25 +52,45 @@ sha1_block_data_order:
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
+
        movl    _gnutls_x86_cpuid_s+0(%rip),%r9d
        movl    _gnutls_x86_cpuid_s+4(%rip),%r8d
+       movl    _gnutls_x86_cpuid_s+8(%rip),%r10d
        testl   $512,%r8d
        jz      .Lialu
+       testl   $536870912,%r10d
+       jnz     _shaext_shortcut
+       andl    $296,%r10d
+       cmpl    $296,%r10d
+       je      _avx2_shortcut
+       andl    $268435456,%r8d
+       andl    $1073741824,%r9d
+       orl     %r9d,%r8d
+       cmpl    $1342177280,%r8d
+       je      _avx_shortcut
        jmp     _ssse3_shortcut
 
 .p2align       4
 .Lialu:
+       movq    %rsp,%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
-       movq    %rsp,%r11
+
+       pushq   %r14
+
        movq    %rdi,%r8
        subq    $72,%rsp
        movq    %rsi,%r9
        andq    $-64,%rsp
        movq    %rdx,%r10
-       movq    %r11,64(%rsp)
+       movq    %rax,64(%rsp)
+
 .Lprologue:
 
        movl    0(%r8),%esi
@@ -84,1230 +104,1168 @@ sha1_block_data_order:
 .Lloop:
        movl    0(%r9),%edx
        bswapl  %edx
-       movl    %edx,0(%rsp)
-       movl    %r11d,%eax
        movl    4(%r9),%ebp
+       movl    %r12d,%eax
+       movl    %edx,0(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
        bswapl  %ebp
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %ebp,4(%rsp)
+       leal    1518500249(%rdx,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
-       movl    8(%r9),%edx
+       movl    8(%r9),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,4(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %edx,8(%rsp)
+       leal    1518500249(%rbp,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
-       movl    12(%r9),%ebp
+       movl    12(%r9),%edx
+       movl    %edi,%eax
+       movl    %r14d,8(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %ebp,12(%rsp)
+       leal    1518500249(%r14,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    16(%r9),%edx
+       movl    16(%r9),%ebp
+       movl    %esi,%eax
+       movl    %edx,12(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %edx
+       bswapl  %ebp
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %edx,16(%rsp)
+       leal    1518500249(%rdx,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    20(%r9),%ebp
+       movl    20(%r9),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,16(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %ebp
+       bswapl  %r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %ebp,20(%rsp)
+       leal    1518500249(%rbp,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    %r11d,%eax
        movl    24(%r9),%edx
+       movl    %r12d,%eax
+       movl    %r14d,20(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
        bswapl  %edx
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %edx,24(%rsp)
+       leal    1518500249(%r14,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
        movl    28(%r9),%ebp
+       movl    %r11d,%eax
+       movl    %edx,24(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
        bswapl  %ebp
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %ebp,28(%rsp)
+       leal    1518500249(%rdx,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
-       movl    32(%r9),%edx
+       movl    32(%r9),%r14d
+       movl    %edi,%eax
+       movl    %ebp,28(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %edx,32(%rsp)
+       leal    1518500249(%rbp,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    36(%r9),%ebp
+       movl    36(%r9),%edx
+       movl    %esi,%eax
+       movl    %r14d,32(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %ebp,36(%rsp)
+       leal    1518500249(%r14,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    40(%r9),%edx
+       movl    40(%r9),%ebp
+       movl    %r13d,%eax
+       movl    %edx,36(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %edx
+       bswapl  %ebp
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %edx,40(%rsp)
+       leal    1518500249(%rdx,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    %r11d,%eax
-       movl    44(%r9),%ebp
+       movl    44(%r9),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,40(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
-       bswapl  %ebp
+       bswapl  %r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %ebp,44(%rsp)
+       leal    1518500249(%rbp,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
        movl    48(%r9),%edx
+       movl    %r11d,%eax
+       movl    %r14d,44(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
        bswapl  %edx
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %edx,48(%rsp)
+       leal    1518500249(%r14,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
        movl    52(%r9),%ebp
+       movl    %edi,%eax
+       movl    %edx,48(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
        bswapl  %ebp
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %ebp,52(%rsp)
+       leal    1518500249(%rdx,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    56(%r9),%edx
+       movl    56(%r9),%r14d
+       movl    %esi,%eax
+       movl    %ebp,52(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %edx,56(%rsp)
+       leal    1518500249(%rbp,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    60(%r9),%ebp
+       movl    60(%r9),%edx
+       movl    %r13d,%eax
+       movl    %r14d,56(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %ebp,60(%rsp)
+       leal    1518500249(%r14,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    0(%rsp),%edx
-       movl    %r11d,%eax
+       xorl    0(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,60(%rsp)
        movl    %esi,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    8(%rsp),%ebp
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       xorl    32(%rsp),%edx
+       xorl    32(%rsp),%ebp
        andl    %edi,%eax
-       leal    1518500249(%rbp,%r13,1),%r13d
-       xorl    52(%rsp),%edx
+       leal    1518500249(%rdx,%r13,1),%r13d
+       roll    $30,%edi
        xorl    %r12d,%eax
-       roll    $1,%edx
        addl    %ecx,%r13d
-       roll    $30,%edi
-       movl    %edx,0(%rsp)
+       roll    $1,%ebp
        addl    %eax,%r13d
-       movl    4(%rsp),%ebp
-       movl    %edi,%eax
+       xorl    4(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,0(%rsp)
        movl    %r13d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    12(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
-       xorl    36(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        andl    %esi,%eax
-       leal    1518500249(%rdx,%r12,1),%r12d
-       xorl    56(%rsp),%ebp
+       leal    1518500249(%rbp,%r12,1),%r12d
+       roll    $30,%esi
        xorl    %r11d,%eax
-       roll    $1,%ebp
        addl    %ecx,%r12d
-       roll    $30,%esi
-       movl    %ebp,4(%rsp)
+       roll    $1,%r14d
        addl    %eax,%r12d
-       movl    8(%rsp),%edx
-       movl    %esi,%eax
+       xorl    8(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,4(%rsp)
        movl    %r12d,%ecx
        xorl    16(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
        xorl    40(%rsp),%edx
        andl    %r13d,%eax
-       leal    1518500249(%rbp,%r11,1),%r11d
-       xorl    60(%rsp),%edx
+       leal    1518500249(%r14,%r11,1),%r11d
+       roll    $30,%r13d
        xorl    %edi,%eax
-       roll    $1,%edx
        addl    %ecx,%r11d
-       roll    $30,%r13d
-       movl    %edx,8(%rsp)
+       roll    $1,%edx
        addl    %eax,%r11d
-       movl    12(%rsp),%ebp
-       movl    %r13d,%eax
+       xorl    12(%rsp),%ebp
+       movl    %esi,%eax
+       movl    %edx,8(%rsp)
        movl    %r11d,%ecx
        xorl    20(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
        xorl    44(%rsp),%ebp
        andl    %r12d,%eax
        leal    1518500249(%rdx,%rdi,1),%edi
-       xorl    0(%rsp),%ebp
+       roll    $30,%r12d
        xorl    %esi,%eax
-       roll    $1,%ebp
        addl    %ecx,%edi
-       roll    $30,%r12d
-       movl    %ebp,12(%rsp)
+       roll    $1,%ebp
        addl    %eax,%edi
-       movl    16(%rsp),%edx
-       movl    %r12d,%eax
+       xorl    16(%rsp),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,12(%rsp)
        movl    %edi,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    24(%rsp),%r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       xorl    48(%rsp),%edx
+       xorl    48(%rsp),%r14d
        andl    %r11d,%eax
        leal    1518500249(%rbp,%rsi,1),%esi
-       xorl    4(%rsp),%edx
+       roll    $30,%r11d
        xorl    %r13d,%eax
-       roll    $1,%edx
        addl    %ecx,%esi
-       roll    $30,%r11d
-       movl    %edx,16(%rsp)
+       roll    $1,%r14d
        addl    %eax,%esi
-       movl    20(%rsp),%ebp
-       movl    %r11d,%eax
+       xorl    20(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,16(%rsp)
        movl    %esi,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r13,1),%r13d
-       xorl    52(%rsp),%ebp
+       xorl    28(%rsp),%edx
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    52(%rsp),%edx
+       leal    1859775393(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    8(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%ebp
-       movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %edi,%eax
+       roll    $1,%edx
+       xorl    24(%rsp),%ebp
+       movl    %esi,%eax
+       movl    %edx,20(%rsp)
        movl    %r13d,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%r12,1),%r12d
-       xorl    56(%rsp),%edx
+       xorl    32(%rsp),%ebp
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    56(%rsp),%ebp
+       leal    1859775393(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    12(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%ebp
+       xorl    28(%rsp),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,24(%rsp)
        movl    %r12d,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r11,1),%r11d
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%r14d
+       leal    1859775393(%rbp,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    16(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    32(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,28(%rsp)
        movl    %r11d,%ecx
        xorl    40(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%rdi,1),%edi
        xorl    0(%rsp),%edx
-       xorl    %esi,%eax
+       leal    1859775393(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    20(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
        roll    $1,%edx
+       xorl    36(%rsp),%ebp
+       movl    %r11d,%eax
        movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %r12d,%eax
        movl    %edi,%ecx
        xorl    44(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%rsi,1),%esi
        xorl    4(%rsp),%ebp
-       xorl    %r13d,%eax
+       leal    1859775393(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    24(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
        roll    $1,%ebp
+       xorl    40(%rsp),%r14d
+       movl    %edi,%eax
        movl    %ebp,36(%rsp)
-       movl    40(%rsp),%edx
-       movl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    48(%rsp),%r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
+       xorl    8(%rsp),%r14d
        leal    1859775393(%rbp,%r13,1),%r13d
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    28(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%edx
-       movl    %edx,40(%rsp)
-       movl    44(%rsp),%ebp
-       movl    %edi,%eax
+       roll    $1,%r14d
+       xorl    44(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,40(%rsp)
        movl    %r13d,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r12,1),%r12d
-       xorl    12(%rsp),%ebp
+       xorl    52(%rsp),%edx
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    12(%rsp),%edx
+       leal    1859775393(%r14,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    32(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%ebp
-       movl    %ebp,44(%rsp)
-       movl    48(%rsp),%edx
-       movl    %esi,%eax
+       roll    $1,%edx
+       xorl    48(%rsp),%ebp
+       movl    %r13d,%eax
+       movl    %edx,44(%rsp)
        movl    %r12d,%ecx
-       xorl    56(%rsp),%edx
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%r11,1),%r11d
-       xorl    16(%rsp),%edx
+       xorl    56(%rsp),%ebp
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    16(%rsp),%ebp
+       leal    1859775393(%rdx,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    36(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,48(%rsp)
-       movl    52(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%ebp
+       xorl    52(%rsp),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,48(%rsp)
        movl    %r11d,%ecx
-       xorl    60(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rdi,1),%edi
-       xorl    20(%rsp),%ebp
+       xorl    60(%rsp),%r14d
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    20(%rsp),%r14d
+       leal    1859775393(%rbp,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    40(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,52(%rsp)
-       movl    56(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%r14d
+       xorl    56(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,52(%rsp)
        movl    %edi,%ecx
        xorl    0(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%rsi,1),%esi
        xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       leal    1859775393(%r14,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    44(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
        roll    $1,%edx
+       xorl    60(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,56(%rsp)
-       movl    60(%rsp),%ebp
-       movl    %r11d,%eax
        movl    %esi,%ecx
        xorl    4(%rsp),%ebp
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%r13,1),%r13d
        xorl    28(%rsp),%ebp
-       xorl    %r12d,%eax
+       leal    1859775393(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    48(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%ebp
+       xorl    0(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,60(%rsp)
-       movl    0(%rsp),%edx
-       movl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    8(%rsp),%r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
+       xorl    32(%rsp),%r14d
        leal    1859775393(%rbp,%r12,1),%r12d
-       xorl    32(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    52(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,0(%rsp)
-       movl    4(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    4(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,0(%rsp)
        movl    %r12d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r11,1),%r11d
-       xorl    36(%rsp),%ebp
+       xorl    12(%rsp),%edx
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    36(%rsp),%edx
+       leal    1859775393(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    56(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,4(%rsp)
-       movl    8(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%edx
+       xorl    8(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,4(%rsp)
        movl    %r11d,%ecx
-       xorl    16(%rsp),%edx
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%rdi,1),%edi
-       xorl    40(%rsp),%edx
+       xorl    16(%rsp),%ebp
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    40(%rsp),%ebp
+       leal    1859775393(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    60(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,8(%rsp)
-       movl    12(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    12(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,8(%rsp)
        movl    %edi,%ecx
-       xorl    20(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rsi,1),%esi
-       xorl    44(%rsp),%ebp
+       xorl    20(%rsp),%r14d
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    44(%rsp),%r14d
+       leal    1859775393(%rbp,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    0(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,12(%rsp)
-       movl    16(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%r14d
+       xorl    16(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,12(%rsp)
        movl    %esi,%ecx
        xorl    24(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%r13,1),%r13d
        xorl    48(%rsp),%edx
-       xorl    %r12d,%eax
+       leal    1859775393(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    4(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%edx
+       xorl    20(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,16(%rsp)
-       movl    20(%rsp),%ebp
-       movl    %edi,%eax
        movl    %r13d,%ecx
        xorl    28(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%r12,1),%r12d
        xorl    52(%rsp),%ebp
-       xorl    %r11d,%eax
+       leal    1859775393(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    8(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%ebp
+       xorl    24(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    32(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
+       xorl    56(%rsp),%r14d
        leal    1859775393(%rbp,%r11,1),%r11d
-       xorl    56(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    12(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    28(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,24(%rsp)
        movl    %r11d,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rdi,1),%edi
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%edx
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%edx
+       leal    1859775393(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    16(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%edx
+       xorl    32(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,28(%rsp)
        movl    %edi,%ecx
-       xorl    40(%rsp),%edx
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%rsi,1),%esi
-       xorl    0(%rsp),%edx
+       xorl    40(%rsp),%ebp
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    0(%rsp),%ebp
+       leal    1859775393(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    20(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%edx
-       movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    44(%rsp),%ebp
-       andl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    36(%rsp),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,32(%rsp)
+       movl    %r12d,%ebx
+       xorl    44(%rsp),%r14d
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    4(%rsp),%r14d
+       leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    24(%rsp),%ebp
        addl    %eax,%r13d
+       roll    $1,%r14d
        andl    %edi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %ebp,36(%rsp)
        addl    %ecx,%r13d
-       movl    40(%rsp),%edx
-       movl    %edi,%eax
-       movl    %edi,%ebx
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    40(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,36(%rsp)
+       movl    %r11d,%ebx
        xorl    48(%rsp),%edx
-       andl    %r11d,%eax
+       andl    %edi,%eax
        movl    %r13d,%ecx
        xorl    8(%rsp),%edx
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rbp,%r12,1),%r12d
+       leal    -1894007588(%r14,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    28(%rsp),%edx
        addl    %eax,%r12d
-       andl    %esi,%ebx
        roll    $1,%edx
-       addl    %ebx,%r12d
+       andl    %esi,%ebx
+       addl    %ecx,%r12d
        roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    44(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,40(%rsp)
-       addl    %ecx,%r12d
-       movl    44(%rsp),%ebp
-       movl    %esi,%eax
-       movl    %esi,%ebx
+       movl    %edi,%ebx
        xorl    52(%rsp),%ebp
-       andl    %edi,%eax
+       andl    %esi,%eax
        movl    %r12d,%ecx
        xorl    12(%rsp),%ebp
-       xorl    %edi,%ebx
        leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    32(%rsp),%ebp
        addl    %eax,%r11d
-       andl    %r13d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%r11d
+       andl    %r13d,%ebx
+       addl    %ecx,%r11d
        roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    48(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,44(%rsp)
-       addl    %ecx,%r11d
-       movl    48(%rsp),%edx
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
-       xorl    56(%rsp),%edx
-       andl    %esi,%eax
+       movl    %esi,%ebx
+       xorl    56(%rsp),%r14d
+       andl    %r13d,%eax
        movl    %r11d,%ecx
-       xorl    16(%rsp),%edx
-       xorl    %esi,%ebx
+       xorl    16(%rsp),%r14d
        leal    -1894007588(%rbp,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    36(%rsp),%edx
        addl    %eax,%edi
+       roll    $1,%r14d
        andl    %r12d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%edi
-       roll    $30,%r12d
-       movl    %edx,48(%rsp)
        addl    %ecx,%edi
-       movl    52(%rsp),%ebp
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
-       xorl    60(%rsp),%ebp
-       andl    %r13d,%eax
+       roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    52(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,48(%rsp)
+       movl    %r13d,%ebx
+       xorl    60(%rsp),%edx
+       andl    %r12d,%eax
        movl    %edi,%ecx
-       xorl    20(%rsp),%ebp
-       xorl    %r13d,%ebx
-       leal    -1894007588(%rdx,%rsi,1),%esi
+       xorl    20(%rsp),%edx
+       leal    -1894007588(%r14,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    40(%rsp),%ebp
        addl    %eax,%esi
+       roll    $1,%edx
        andl    %r11d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%esi
-       roll    $30,%r11d
-       movl    %ebp,52(%rsp)
        addl    %ecx,%esi
-       movl    56(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    0(%rsp),%edx
-       andl    %r12d,%eax
+       roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    56(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,52(%rsp)
+       movl    %r12d,%ebx
+       xorl    0(%rsp),%ebp
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    24(%rsp),%ebp
+       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    44(%rsp),%edx
        addl    %eax,%r13d
+       roll    $1,%ebp
        andl    %edi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %edx,56(%rsp)
        addl    %ecx,%r13d
-       movl    60(%rsp),%ebp
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    4(%rsp),%ebp
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    60(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,56(%rsp)
+       movl    %r11d,%ebx
+       xorl    4(%rsp),%r14d
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    28(%rsp),%r14d
+       leal    -1894007588(%rbp,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    48(%rsp),%ebp
        addl    %eax,%r12d
+       roll    $1,%r14d
        andl    %esi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %ebp,60(%rsp)
        addl    %ecx,%r12d
-       movl    0(%rsp),%edx
-       movl    %esi,%eax
-       movl    %esi,%ebx
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    0(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,60(%rsp)
+       movl    %edi,%ebx
        xorl    8(%rsp),%edx
-       andl    %edi,%eax
+       andl    %esi,%eax
        movl    %r12d,%ecx
        xorl    32(%rsp),%edx
-       xorl    %edi,%ebx
-       leal    -1894007588(%rbp,%r11,1),%r11d
+       leal    -1894007588(%r14,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    52(%rsp),%edx
        addl    %eax,%r11d
-       andl    %r13d,%ebx
        roll    $1,%edx
-       addl    %ebx,%r11d
+       andl    %r13d,%ebx
+       addl    %ecx,%r11d
        roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    4(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,0(%rsp)
-       addl    %ecx,%r11d
-       movl    4(%rsp),%ebp
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
+       movl    %esi,%ebx
        xorl    12(%rsp),%ebp
-       andl    %esi,%eax
+       andl    %r13d,%eax
        movl    %r11d,%ecx
        xorl    36(%rsp),%ebp
-       xorl    %esi,%ebx
        leal    -1894007588(%rdx,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    56(%rsp),%ebp
        addl    %eax,%edi
-       andl    %r12d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%edi
+       andl    %r12d,%ebx
+       addl    %ecx,%edi
        roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    8(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,4(%rsp)
-       addl    %ecx,%edi
-       movl    8(%rsp),%edx
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
-       xorl    16(%rsp),%edx
-       andl    %r13d,%eax
+       movl    %r13d,%ebx
+       xorl    16(%rsp),%r14d
+       andl    %r12d,%eax
        movl    %edi,%ecx
-       xorl    40(%rsp),%edx
-       xorl    %r13d,%ebx
+       xorl    40(%rsp),%r14d
        leal    -1894007588(%rbp,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    60(%rsp),%edx
        addl    %eax,%esi
+       roll    $1,%r14d
        andl    %r11d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%esi
-       roll    $30,%r11d
-       movl    %edx,8(%rsp)
        addl    %ecx,%esi
-       movl    12(%rsp),%ebp
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    20(%rsp),%ebp
-       andl    %r12d,%eax
+       roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    12(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,8(%rsp)
+       movl    %r12d,%ebx
+       xorl    20(%rsp),%edx
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    44(%rsp),%ebp
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    44(%rsp),%edx
+       leal    -1894007588(%r14,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    0(%rsp),%ebp
        addl    %eax,%r13d
+       roll    $1,%edx
        andl    %edi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %ebp,12(%rsp)
        addl    %ecx,%r13d
-       movl    16(%rsp),%edx
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    24(%rsp),%edx
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    16(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,12(%rsp)
+       movl    %r11d,%ebx
+       xorl    24(%rsp),%ebp
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rbp,%r12,1),%r12d
+       xorl    48(%rsp),%ebp
+       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    4(%rsp),%edx
        addl    %eax,%r12d
+       roll    $1,%ebp
        andl    %esi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %edx,16(%rsp)
        addl    %ecx,%r12d
-       movl    20(%rsp),%ebp
-       movl    %esi,%eax
-       movl    %esi,%ebx
-       xorl    28(%rsp),%ebp
-       andl    %edi,%eax
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    20(%rsp),%r14d
+       movl    %edi,%eax
+       movl    %ebp,16(%rsp)
+       movl    %edi,%ebx
+       xorl    28(%rsp),%r14d
+       andl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %edi,%ebx
-       leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    52(%rsp),%r14d
+       leal    -1894007588(%rbp,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    8(%rsp),%ebp
        addl    %eax,%r11d
+       roll    $1,%r14d
        andl    %r13d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r11d
-       roll    $30,%r13d
-       movl    %ebp,20(%rsp)
        addl    %ecx,%r11d
-       movl    24(%rsp),%edx
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
+       roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    24(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,20(%rsp)
+       movl    %esi,%ebx
        xorl    32(%rsp),%edx
-       andl    %esi,%eax
+       andl    %r13d,%eax
        movl    %r11d,%ecx
        xorl    56(%rsp),%edx
-       xorl    %esi,%ebx
-       leal    -1894007588(%rbp,%rdi,1),%edi
+       leal    -1894007588(%r14,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    12(%rsp),%edx
        addl    %eax,%edi
-       andl    %r12d,%ebx
        roll    $1,%edx
-       addl    %ebx,%edi
+       andl    %r12d,%ebx
+       addl    %ecx,%edi
        roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    28(%rsp),%ebp
+       movl    %r13d,%eax
        movl    %edx,24(%rsp)
-       addl    %ecx,%edi
-       movl    28(%rsp),%ebp
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
+       movl    %r13d,%ebx
        xorl    36(%rsp),%ebp
-       andl    %r13d,%eax
+       andl    %r12d,%eax
        movl    %edi,%ecx
        xorl    60(%rsp),%ebp
-       xorl    %r13d,%ebx
        leal    -1894007588(%rdx,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    16(%rsp),%ebp
        addl    %eax,%esi
-       andl    %r11d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%esi
+       andl    %r11d,%ebx
+       addl    %ecx,%esi
        roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    32(%rsp),%r14d
+       movl    %r12d,%eax
        movl    %ebp,28(%rsp)
-       addl    %ecx,%esi
-       movl    32(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    40(%rsp),%edx
-       andl    %r12d,%eax
+       movl    %r12d,%ebx
+       xorl    40(%rsp),%r14d
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    0(%rsp),%edx
-       xorl    %r12d,%ebx
+       xorl    0(%rsp),%r14d
        leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    20(%rsp),%edx
        addl    %eax,%r13d
+       roll    $1,%r14d
        andl    %edi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %edx,32(%rsp)
        addl    %ecx,%r13d
-       movl    36(%rsp),%ebp
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    44(%rsp),%ebp
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    36(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,32(%rsp)
+       movl    %r11d,%ebx
+       xorl    44(%rsp),%edx
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    4(%rsp),%edx
+       leal    -1894007588(%r14,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    24(%rsp),%ebp
        addl    %eax,%r12d
+       roll    $1,%edx
        andl    %esi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %ebp,36(%rsp)
        addl    %ecx,%r12d
-       movl    40(%rsp),%edx
-       movl    %esi,%eax
-       movl    %esi,%ebx
-       xorl    48(%rsp),%edx
-       andl    %edi,%eax
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    40(%rsp),%ebp
+       movl    %edi,%eax
+       movl    %edx,36(%rsp)
+       movl    %edi,%ebx
+       xorl    48(%rsp),%ebp
+       andl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %edi,%ebx
-       leal    -1894007588(%rbp,%r11,1),%r11d
+       xorl    8(%rsp),%ebp
+       leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    28(%rsp),%edx
        addl    %eax,%r11d
+       roll    $1,%ebp
        andl    %r13d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r11d
-       roll    $30,%r13d
-       movl    %edx,40(%rsp)
        addl    %ecx,%r11d
-       movl    44(%rsp),%ebp
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
-       xorl    52(%rsp),%ebp
-       andl    %esi,%eax
+       roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    44(%rsp),%r14d
+       movl    %esi,%eax
+       movl    %ebp,40(%rsp)
+       movl    %esi,%ebx
+       xorl    52(%rsp),%r14d
+       andl    %r13d,%eax
        movl    %r11d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %esi,%ebx
-       leal    -1894007588(%rdx,%rdi,1),%edi
+       xorl    12(%rsp),%r14d
+       leal    -1894007588(%rbp,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    32(%rsp),%ebp
        addl    %eax,%edi
+       roll    $1,%r14d
        andl    %r12d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%edi
-       roll    $30,%r12d
-       movl    %ebp,44(%rsp)
        addl    %ecx,%edi
-       movl    48(%rsp),%edx
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
+       roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    48(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,44(%rsp)
+       movl    %r13d,%ebx
        xorl    56(%rsp),%edx
-       andl    %r13d,%eax
+       andl    %r12d,%eax
        movl    %edi,%ecx
        xorl    16(%rsp),%edx
-       xorl    %r13d,%ebx
-       leal    -1894007588(%rbp,%rsi,1),%esi
+       leal    -1894007588(%r14,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    36(%rsp),%edx
        addl    %eax,%esi
-       andl    %r11d,%ebx
        roll    $1,%edx
-       addl    %ebx,%esi
+       andl    %r11d,%ebx
+       addl    %ecx,%esi
        roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    52(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,48(%rsp)
-       addl    %ecx,%esi
-       movl    52(%rsp),%ebp
-       movl    %r11d,%eax
        movl    %esi,%ecx
        xorl    60(%rsp),%ebp
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r13,1),%r13d
        xorl    20(%rsp),%ebp
-       xorl    %r12d,%eax
+       leal    -899497514(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    40(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%ebp
+       xorl    56(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,52(%rsp)
-       movl    56(%rsp),%edx
-       movl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    0(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    0(%rsp),%r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
+       xorl    24(%rsp),%r14d
        leal    -899497514(%rbp,%r12,1),%r12d
-       xorl    24(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    44(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,56(%rsp)
-       movl    60(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    60(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,56(%rsp)
        movl    %r12d,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r11,1),%r11d
-       xorl    28(%rsp),%ebp
+       xorl    4(%rsp),%edx
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    28(%rsp),%edx
+       leal    -899497514(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    48(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,60(%rsp)
-       movl    0(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%edx
+       xorl    0(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,60(%rsp)
        movl    %r11d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%rdi,1),%edi
-       xorl    32(%rsp),%edx
+       xorl    8(%rsp),%ebp
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    32(%rsp),%ebp
+       leal    -899497514(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    52(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,0(%rsp)
-       movl    4(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    4(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,0(%rsp)
        movl    %edi,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rsi,1),%esi
-       xorl    36(%rsp),%ebp
+       xorl    12(%rsp),%r14d
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    36(%rsp),%r14d
+       leal    -899497514(%rbp,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    56(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,4(%rsp)
-       movl    8(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%r14d
+       xorl    8(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,4(%rsp)
        movl    %esi,%ecx
        xorl    16(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r13,1),%r13d
        xorl    40(%rsp),%edx
-       xorl    %r12d,%eax
+       leal    -899497514(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    60(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%edx
+       xorl    12(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,8(%rsp)
-       movl    12(%rsp),%ebp
-       movl    %edi,%eax
        movl    %r13d,%ecx
        xorl    20(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r12,1),%r12d
        xorl    44(%rsp),%ebp
-       xorl    %r11d,%eax
+       leal    -899497514(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    0(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%ebp
+       xorl    16(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,12(%rsp)
-       movl    16(%rsp),%edx
-       movl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    24(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
+       xorl    48(%rsp),%r14d
        leal    -899497514(%rbp,%r11,1),%r11d
-       xorl    48(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    4(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,16(%rsp)
-       movl    20(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    20(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,16(%rsp)
        movl    %r11d,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rdi,1),%edi
-       xorl    52(%rsp),%ebp
+       xorl    28(%rsp),%edx
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    52(%rsp),%edx
+       leal    -899497514(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    8(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%edx
+       xorl    24(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,20(%rsp)
        movl    %edi,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%rsi,1),%esi
-       xorl    56(%rsp),%edx
+       xorl    32(%rsp),%ebp
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    56(%rsp),%ebp
+       leal    -899497514(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    12(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %r11d,%eax
+       roll    $1,%ebp
+       xorl    28(%rsp),%r14d
+       movl    %edi,%eax
+       movl    %ebp,24(%rsp)
        movl    %esi,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r13,1),%r13d
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%r14d
+       leal    -899497514(%rbp,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    16(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %edi,%eax
+       roll    $1,%r14d
+       xorl    32(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,28(%rsp)
        movl    %r13d,%ecx
        xorl    40(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r12,1),%r12d
        xorl    0(%rsp),%edx
-       xorl    %r11d,%eax
+       leal    -899497514(%r14,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    20(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%edx
-       movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %esi,%eax
+       xorl    36(%rsp),%ebp
+       movl    %r13d,%eax
+
        movl    %r12d,%ecx
        xorl    44(%rsp),%ebp
-       xorl    %r13d,%eax
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r11,1),%r11d
        xorl    4(%rsp),%ebp
-       xorl    %edi,%eax
+       leal    -899497514(%rdx,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    24(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
        roll    $1,%ebp
-       movl    %ebp,36(%rsp)
-       movl    40(%rsp),%edx
-       movl    %r13d,%eax
+       xorl    40(%rsp),%r14d
+       movl    %r12d,%eax
+
        movl    %r11d,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    48(%rsp),%r14d
+       xorl    %esi,%eax
        roll    $5,%ecx
+       xorl    8(%rsp),%r14d
        leal    -899497514(%rbp,%rdi,1),%edi
-       xorl    8(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    28(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,40(%rsp)
-       movl    44(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%r14d
+       xorl    44(%rsp),%edx
+       movl    %r11d,%eax
+
        movl    %edi,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    52(%rsp),%edx
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%rsi,1),%esi
-       xorl    12(%rsp),%ebp
-       xorl    %r13d,%eax
+       xorl    12(%rsp),%edx
+       leal    -899497514(%r14,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    32(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,44(%rsp)
-       movl    48(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%edx
+       xorl    48(%rsp),%ebp
+       movl    %edi,%eax
+
        movl    %esi,%ecx
-       xorl    56(%rsp),%edx
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%r13,1),%r13d
-       xorl    16(%rsp),%edx
+       xorl    56(%rsp),%ebp
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    16(%rsp),%ebp
+       leal    -899497514(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    36(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%edx
-       movl    %edx,48(%rsp)
-       movl    52(%rsp),%ebp
-       movl    %edi,%eax
+       roll    $1,%ebp
+       xorl    52(%rsp),%r14d
+       movl    %esi,%eax
+
        movl    %r13d,%ecx
-       xorl    60(%rsp),%ebp
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r12,1),%r12d
-       xorl    20(%rsp),%ebp
+       xorl    60(%rsp),%r14d
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    20(%rsp),%r14d
+       leal    -899497514(%rbp,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    40(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%ebp
-       movl    56(%rsp),%edx
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    56(%rsp),%edx
+       movl    %r13d,%eax
+
        movl    %r12d,%ecx
        xorl    0(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r11,1),%r11d
        xorl    24(%rsp),%edx
-       xorl    %edi,%eax
+       leal    -899497514(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    44(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
        roll    $1,%edx
-       movl    60(%rsp),%ebp
-       movl    %r13d,%eax
+       xorl    60(%rsp),%ebp
+       movl    %r12d,%eax
+
        movl    %r11d,%ecx
        xorl    4(%rsp),%ebp
-       xorl    %r12d,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%rdi,1),%edi
        xorl    28(%rsp),%ebp
-       xorl    %esi,%eax
+       leal    -899497514(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    48(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
        roll    $1,%ebp
-       movl    %r12d,%eax
+       movl    %r11d,%eax
        movl    %edi,%ecx
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        leal    -899497514(%rbp,%rsi,1),%esi
        roll    $5,%ecx
-       xorl    %r13d,%eax
+       xorl    %r12d,%eax
        addl    %ecx,%esi
        roll    $30,%r11d
        addl    %eax,%esi
@@ -1327,16 +1285,216 @@ sha1_block_data_order:
        jnz     .Lloop
 
        movq    64(%rsp),%rsi
-       movq    (%rsi),%r13
-       movq    8(%rsi),%r12
-       movq    16(%rsi),%rbp
-       movq    24(%rsi),%rbx
-       leaq    32(%rsi),%rsp
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
 .Lepilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
+
 .LSEH_end_sha1_block_data_order:
+.def   sha1_block_data_order_shaext;   .scl 3; .type 32;       .endef    # COFF symbol record: storage class 3, type 32
+.p2align       5    # 32-byte align the entry point
+sha1_block_data_order_shaext:    # SHA-1 block routine using SHA-extension opcodes (hand-encoded as .byte); Win64 entry: rcx=state, rdx=input, r8=number of 64-byte blocks (must be nonzero: the loop decrements before testing)
+       movq    %rdi,8(%rsp)    # stash rdi in the caller-provided slot above the return address (callee-saved on Win64)
+       movq    %rsi,16(%rsp)    # likewise for rsi
+       movq    %rsp,%rax    # rax = entry rsp; never written again until the epilogue restores rsp from it
+.LSEH_begin_sha1_block_data_order_shaext:
+       movq    %rcx,%rdi    # rdi = state pointer (16 bytes + one dword are read here and written back at the end)
+       movq    %rdx,%rsi    # rsi = input pointer
+       movq    %r8,%rdx    # rdx = remaining block count
+
+_shaext_shortcut:    # presumably an alternate entry taken with rdi/rsi/rdx already set up; the jump site is outside this view
+
+       leaq    -72(%rsp),%rsp    # reserve 72 bytes: four 16-byte xmm save slots at rax-72 .. rax-9
+       movaps  %xmm6,-8-64(%rax)    # xmm6..xmm9 are callee-saved on Win64 and are overwritten below
+       movaps  %xmm7,-8-48(%rax)
+       movaps  %xmm8,-8-32(%rax)
+       movaps  %xmm9,-8-16(%rax)
+.Lprologue_shaext:
+       movdqu  (%rdi),%xmm0    # xmm0 = first four state dwords
+       movd    16(%rdi),%xmm1    # xmm1 = fifth state dword (low lane, upper lanes zeroed)
+       movdqa  K_XX_XX+160(%rip),%xmm3    # xmm3 = pshufb control mask; table is defined outside this view (presumably a byte-swap mask)
+
+       movdqu  (%rsi),%xmm4    # xmm4..xmm7 = the 64-byte input block, 16 bytes each
+       pshufd  $27,%xmm0,%xmm0    # reverse dword order of the state vector
+       movdqu  16(%rsi),%xmm5
+       pshufd  $27,%xmm1,%xmm1    # moves the fifth state dword to the top lane
+       movdqu  32(%rsi),%xmm6
+.byte  102,15,56,0,227    # pshufb %xmm3,%xmm4
+       movdqu  48(%rsi),%xmm7
+.byte  102,15,56,0,235    # pshufb %xmm3,%xmm5
+.byte  102,15,56,0,243    # pshufb %xmm3,%xmm6
+       movdqa  %xmm1,%xmm9    # xmm9 = fifth state word at block start, added back by the last sha1nexte
+.byte  102,15,56,0,251    # pshufb %xmm3,%xmm7
+       jmp     .Loop_shaext
+
+.p2align       4    # 16-byte align the loop head
+.Loop_shaext:    # one iteration = one 64-byte block = twenty sha1rnds4 (80 rounds)
+       decq    %rdx    # sets ZF for both the cmovne below and the jnz at the loop bottom; nothing in between writes flags
+       leaq    64(%rsi),%r8    # r8 = address of the following block (lea leaves flags intact)
+       paddd   %xmm4,%xmm1    # add the first four message words into the E vector
+       cmovneq %r8,%rsi    # advance the input pointer only if more blocks remain, so the reloads below never read past the last block
+       movdqa  %xmm0,%xmm8    # xmm8 = state vector at block start, added back after round 80
+.byte  15,56,201,229    # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2    # keep pre-round state for the following sha1nexte
+.byte  15,58,204,193,0    # sha1rnds4 $0,%xmm1,%xmm0
+.byte  15,56,200,213    # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238    # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231    # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,0    # sha1rnds4 $0,%xmm2,%xmm0
+.byte  15,56,200,206    # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236    # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247    # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0    # sha1rnds4 $0,%xmm1,%xmm0
+.byte  15,56,200,215    # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252    # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245    # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,0    # sha1rnds4 $0,%xmm2,%xmm0
+.byte  15,56,200,204    # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254    # sha1msg2 %xmm6,%xmm7
+.byte  15,56,201,229    # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0    # sha1rnds4 $0,%xmm1,%xmm0 (fifth and last with selector 0)
+.byte  15,56,200,213    # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238    # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231    # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1    # sha1rnds4 $1,%xmm2,%xmm0 (selector 1 begins)
+.byte  15,56,200,206    # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236    # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247    # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,1    # sha1rnds4 $1,%xmm1,%xmm0
+.byte  15,56,200,215    # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252    # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245    # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1    # sha1rnds4 $1,%xmm2,%xmm0
+.byte  15,56,200,204    # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254    # sha1msg2 %xmm6,%xmm7
+.byte  15,56,201,229    # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,1    # sha1rnds4 $1,%xmm1,%xmm0
+.byte  15,56,200,213    # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238    # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231    # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1    # sha1rnds4 $1,%xmm2,%xmm0 (fifth and last with selector 1)
+.byte  15,56,200,206    # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236    # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247    # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2    # sha1rnds4 $2,%xmm1,%xmm0 (selector 2 begins)
+.byte  15,56,200,215    # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252    # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245    # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,2    # sha1rnds4 $2,%xmm2,%xmm0
+.byte  15,56,200,204    # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254    # sha1msg2 %xmm6,%xmm7
+.byte  15,56,201,229    # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2    # sha1rnds4 $2,%xmm1,%xmm0
+.byte  15,56,200,213    # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238    # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231    # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,2    # sha1rnds4 $2,%xmm2,%xmm0
+.byte  15,56,200,206    # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236    # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247    # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2    # sha1rnds4 $2,%xmm1,%xmm0 (fifth and last with selector 2)
+.byte  15,56,200,215    # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252    # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245    # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3    # sha1rnds4 $3,%xmm2,%xmm0 (selector 3 begins)
+.byte  15,56,200,204    # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254    # sha1msg2 %xmm6,%xmm7
+       movdqu  (%rsi),%xmm4    # xmm4 is free from here on: start loading the next block (or re-load the current one on the final pass)
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,3    # sha1rnds4 $3,%xmm1,%xmm0
+.byte  15,56,200,213    # sha1nexte %xmm5,%xmm2
+       movdqu  16(%rsi),%xmm5
+.byte  102,15,56,0,227    # pshufb %xmm3,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3    # sha1rnds4 $3,%xmm2,%xmm0
+.byte  15,56,200,206    # sha1nexte %xmm6,%xmm1
+       movdqu  32(%rsi),%xmm6
+.byte  102,15,56,0,235    # pshufb %xmm3,%xmm5
+
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,3    # sha1rnds4 $3,%xmm1,%xmm0
+.byte  15,56,200,215    # sha1nexte %xmm7,%xmm2
+       movdqu  48(%rsi),%xmm7
+.byte  102,15,56,0,243    # pshufb %xmm3,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3    # sha1rnds4 $3,%xmm2,%xmm0 (twentieth and last: round 80 done)
+.byte  65,15,56,200,201    # sha1nexte %xmm9,%xmm1 (REX.B prefix selects xmm9, the E value saved at block start)
+.byte  102,15,56,0,251    # pshufb %xmm3,%xmm7
+
+       paddd   %xmm8,%xmm0    # add the block-start state back into the result
+       movdqa  %xmm1,%xmm9    # carry the updated E vector into the next iteration
+
+       jnz     .Loop_shaext    # ZF still comes from the decq at the loop head
+
+       pshufd  $27,%xmm0,%xmm0    # undo the dword reversal before storing
+       pshufd  $27,%xmm1,%xmm1    # bring the fifth state dword back to the low lane for movd
+       movdqu  %xmm0,(%rdi)    # write back the first four state dwords
+       movd    %xmm1,16(%rdi)    # write back the fifth state dword
+       movaps  -8-64(%rax),%xmm6    # restore the Win64 callee-saved xmm registers
+       movaps  -8-48(%rax),%xmm7
+       movaps  -8-32(%rax),%xmm8
+       movaps  -8-16(%rax),%xmm9
+       movq    %rax,%rsp    # release the 72-byte frame
+.Lepilogue_shaext:
+
+       movq    8(%rsp),%rdi    # restore rdi/rsi from the slots filled at entry
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3    # encodes "rep ret"
+.LSEH_end_sha1_block_data_order_shaext:
 .def   sha1_block_data_order_ssse3;    .scl 3; .type 32;       .endef
 .p2align       4
 sha1_block_data_order_ssse3:
@@ -1349,24 +1507,35 @@ sha1_block_data_order_ssse3:
        movq    %r8,%rdx
 
 _ssse3_shortcut:
+
+       movq    %rsp,%r11
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
        leaq    -160(%rsp),%rsp
-       movaps  %xmm6,64+0(%rsp)
-       movaps  %xmm7,64+16(%rsp)
-       movaps  %xmm8,64+32(%rsp)
-       movaps  %xmm9,64+48(%rsp)
-       movaps  %xmm10,64+64(%rsp)
-       movaps  %xmm11,64+80(%rsp)
+       movaps  %xmm6,-40-96(%r11)
+       movaps  %xmm7,-40-80(%r11)
+       movaps  %xmm8,-40-64(%r11)
+       movaps  %xmm9,-40-48(%r11)
+       movaps  %xmm10,-40-32(%r11)
+       movaps  %xmm11,-40-16(%r11)
 .Lprologue_ssse3:
+       andq    $-64,%rsp
        movq    %rdi,%r8
        movq    %rsi,%r9
        movq    %rdx,%r10
 
        shlq    $6,%r10
        addq    %r9,%r10
-       leaq    K_XX_XX(%rip),%r11
+       leaq    K_XX_XX+64(%rip),%r14
 
        movl    0(%r8),%eax
        movl    4(%r8),%ebx
@@ -1378,18 +1547,18 @@ _ssse3_shortcut:
        xorl    %edx,%edi
        andl    %edi,%esi
 
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
+       movdqa  64(%r14),%xmm6
+       movdqa  -64(%r14),%xmm9
        movdqu  0(%r9),%xmm0
        movdqu  16(%r9),%xmm1
        movdqu  32(%r9),%xmm2
        movdqu  48(%r9),%xmm3
 .byte  102,15,56,0,198
-       addq    $64,%r9
 .byte  102,15,56,0,206
 .byte  102,15,56,0,214
-.byte  102,15,56,0,222
+       addq    $64,%r9
        paddd   %xmm9,%xmm0
+.byte  102,15,56,0,222
        paddd   %xmm9,%xmm1
        paddd   %xmm9,%xmm2
        movdqa  %xmm0,0(%rsp)
@@ -1401,24 +1570,24 @@ _ssse3_shortcut:
        jmp     .Loop_ssse3
 .p2align       4
 .Loop_ssse3:
-       movdqa  %xmm1,%xmm4
        rorl    $2,%ebx
+       pshufd  $238,%xmm0,%xmm4
        xorl    %edx,%esi
        movdqa  %xmm3,%xmm8
-.byte  102,15,58,15,224,8
+       paddd   %xmm3,%xmm9
        movl    %eax,%edi
        addl    0(%rsp),%ebp
-       paddd   %xmm3,%xmm9
+       punpcklqdq      %xmm1,%xmm4
        xorl    %ecx,%ebx
        roll    $5,%eax
-       psrldq  $4,%xmm8
        addl    %esi,%ebp
+       psrldq  $4,%xmm8
        andl    %ebx,%edi
-       pxor    %xmm0,%xmm4
        xorl    %ecx,%ebx
+       pxor    %xmm0,%xmm4
        addl    %eax,%ebp
-       pxor    %xmm2,%xmm8
        rorl    $7,%eax
+       pxor    %xmm2,%xmm8
        xorl    %ecx,%edi
        movl    %ebp,%esi
        addl    4(%rsp),%edx
@@ -1429,57 +1598,57 @@ _ssse3_shortcut:
        addl    %edi,%edx
        andl    %eax,%esi
        movdqa  %xmm4,%xmm10
-       movdqa  %xmm4,%xmm8
        xorl    %ebx,%eax
        addl    %ebp,%edx
        rorl    $7,%ebp
+       movdqa  %xmm4,%xmm8
        xorl    %ebx,%esi
        pslldq  $12,%xmm10
        paddd   %xmm4,%xmm4
        movl    %edx,%edi
        addl    8(%rsp),%ecx
+       psrld   $31,%xmm8
        xorl    %eax,%ebp
        roll    $5,%edx
-       psrld   $31,%xmm8
        addl    %esi,%ecx
-       andl    %ebp,%edi
        movdqa  %xmm10,%xmm9
+       andl    %ebp,%edi
        xorl    %eax,%ebp
-       addl    %edx,%ecx
        psrld   $30,%xmm10
-       por     %xmm8,%xmm4
+       addl    %edx,%ecx
        rorl    $7,%edx
+       por     %xmm8,%xmm4
        xorl    %eax,%edi
        movl    %ecx,%esi
        addl    12(%rsp),%ebx
        pslld   $2,%xmm9
        pxor    %xmm10,%xmm4
        xorl    %ebp,%edx
+       movdqa  -64(%r14),%xmm10
        roll    $5,%ecx
-       movdqa  0(%r11),%xmm10
        addl    %edi,%ebx
        andl    %edx,%esi
        pxor    %xmm9,%xmm4
        xorl    %ebp,%edx
        addl    %ecx,%ebx
-       movdqa  %xmm2,%xmm5
        rorl    $7,%ecx
+       pshufd  $238,%xmm1,%xmm5
        xorl    %ebp,%esi
        movdqa  %xmm4,%xmm9
-.byte  102,15,58,15,233,8
+       paddd   %xmm4,%xmm10
        movl    %ebx,%edi
        addl    16(%rsp),%eax
-       paddd   %xmm4,%xmm10
+       punpcklqdq      %xmm2,%xmm5
        xorl    %edx,%ecx
        roll    $5,%ebx
-       psrldq  $4,%xmm9
        addl    %esi,%eax
+       psrldq  $4,%xmm9
        andl    %ecx,%edi
-       pxor    %xmm1,%xmm5
        xorl    %edx,%ecx
+       pxor    %xmm1,%xmm5
        addl    %ebx,%eax
-       pxor    %xmm3,%xmm9
        rorl    $7,%ebx
+       pxor    %xmm3,%xmm9
        xorl    %edx,%edi
        movl    %eax,%esi
        addl    20(%rsp),%ebp
@@ -1490,57 +1659,57 @@ _ssse3_shortcut:
        addl    %edi,%ebp
        andl    %ebx,%esi
        movdqa  %xmm5,%xmm8
-       movdqa  %xmm5,%xmm9
        xorl    %ecx,%ebx
        addl    %eax,%ebp
        rorl    $7,%eax
+       movdqa  %xmm5,%xmm9
        xorl    %ecx,%esi
        pslldq  $12,%xmm8
        paddd   %xmm5,%xmm5
        movl    %ebp,%edi
        addl    24(%rsp),%edx
+       psrld   $31,%xmm9
        xorl    %ebx,%eax
        roll    $5,%ebp
-       psrld   $31,%xmm9
        addl    %esi,%edx
-       andl    %eax,%edi
        movdqa  %xmm8,%xmm10
+       andl    %eax,%edi
        xorl    %ebx,%eax
-       addl    %ebp,%edx
        psrld   $30,%xmm8
-       por     %xmm9,%xmm5
+       addl    %ebp,%edx
        rorl    $7,%ebp
+       por     %xmm9,%xmm5
        xorl    %ebx,%edi
        movl    %edx,%esi
        addl    28(%rsp),%ecx
        pslld   $2,%xmm10
        pxor    %xmm8,%xmm5
        xorl    %eax,%ebp
+       movdqa  -32(%r14),%xmm8
        roll    $5,%edx
-       movdqa  16(%r11),%xmm8
        addl    %edi,%ecx
        andl    %ebp,%esi
        pxor    %xmm10,%xmm5
        xorl    %eax,%ebp
        addl    %edx,%ecx
-       movdqa  %xmm3,%xmm6
        rorl    $7,%edx
+       pshufd  $238,%xmm2,%xmm6
        xorl    %eax,%esi
        movdqa  %xmm5,%xmm10
-.byte  102,15,58,15,242,8
+       paddd   %xmm5,%xmm8
        movl    %ecx,%edi
        addl    32(%rsp),%ebx
-       paddd   %xmm5,%xmm8
+       punpcklqdq      %xmm3,%xmm6
        xorl    %ebp,%edx
        roll    $5,%ecx
-       psrldq  $4,%xmm10
        addl    %esi,%ebx
+       psrldq  $4,%xmm10
        andl    %edx,%edi
-       pxor    %xmm2,%xmm6
        xorl    %ebp,%edx
+       pxor    %xmm2,%xmm6
        addl    %ecx,%ebx
-       pxor    %xmm4,%xmm10
        rorl    $7,%ecx
+       pxor    %xmm4,%xmm10
        xorl    %ebp,%edi
        movl    %ebx,%esi
        addl    36(%rsp),%eax
@@ -1551,57 +1720,57 @@ _ssse3_shortcut:
        addl    %edi,%eax
        andl    %ecx,%esi
        movdqa  %xmm6,%xmm9
-       movdqa  %xmm6,%xmm10
        xorl    %edx,%ecx
        addl    %ebx,%eax
        rorl    $7,%ebx
+       movdqa  %xmm6,%xmm10
        xorl    %edx,%esi
        pslldq  $12,%xmm9
        paddd   %xmm6,%xmm6
        movl    %eax,%edi
        addl    40(%rsp),%ebp
+       psrld   $31,%xmm10
        xorl    %ecx,%ebx
        roll    $5,%eax
-       psrld   $31,%xmm10
        addl    %esi,%ebp
-       andl    %ebx,%edi
        movdqa  %xmm9,%xmm8
+       andl    %ebx,%edi
        xorl    %ecx,%ebx
-       addl    %eax,%ebp
        psrld   $30,%xmm9
-       por     %xmm10,%xmm6
+       addl    %eax,%ebp
        rorl    $7,%eax
+       por     %xmm10,%xmm6
        xorl    %ecx,%edi
        movl    %ebp,%esi
        addl    44(%rsp),%edx
        pslld   $2,%xmm8
        pxor    %xmm9,%xmm6
        xorl    %ebx,%eax
+       movdqa  -32(%r14),%xmm9
        roll    $5,%ebp
-       movdqa  16(%r11),%xmm9
        addl    %edi,%edx
        andl    %eax,%esi
        pxor    %xmm8,%xmm6
        xorl    %ebx,%eax
        addl    %ebp,%edx
-       movdqa  %xmm4,%xmm7
        rorl    $7,%ebp
+       pshufd  $238,%xmm3,%xmm7
        xorl    %ebx,%esi
        movdqa  %xmm6,%xmm8
-.byte  102,15,58,15,251,8
+       paddd   %xmm6,%xmm9
        movl    %edx,%edi
        addl    48(%rsp),%ecx
-       paddd   %xmm6,%xmm9
+       punpcklqdq      %xmm4,%xmm7
        xorl    %eax,%ebp
        roll    $5,%edx
-       psrldq  $4,%xmm8
        addl    %esi,%ecx
+       psrldq  $4,%xmm8
        andl    %ebp,%edi
-       pxor    %xmm3,%xmm7
        xorl    %eax,%ebp
+       pxor    %xmm3,%xmm7
        addl    %edx,%ecx
-       pxor    %xmm5,%xmm8
        rorl    $7,%edx
+       pxor    %xmm5,%xmm8
        xorl    %eax,%edi
        movl    %ecx,%esi
        addl    52(%rsp),%ebx
@@ -1612,78 +1781,78 @@ _ssse3_shortcut:
        addl    %edi,%ebx
        andl    %edx,%esi
        movdqa  %xmm7,%xmm10
-       movdqa  %xmm7,%xmm8
        xorl    %ebp,%edx
        addl    %ecx,%ebx
        rorl    $7,%ecx
+       movdqa  %xmm7,%xmm8
        xorl    %ebp,%esi
        pslldq  $12,%xmm10
        paddd   %xmm7,%xmm7
        movl    %ebx,%edi
        addl    56(%rsp),%eax
+       psrld   $31,%xmm8
        xorl    %edx,%ecx
        roll    $5,%ebx
-       psrld   $31,%xmm8
        addl    %esi,%eax
-       andl    %ecx,%edi
        movdqa  %xmm10,%xmm9
+       andl    %ecx,%edi
        xorl    %edx,%ecx
-       addl    %ebx,%eax
        psrld   $30,%xmm10
-       por     %xmm8,%xmm7
+       addl    %ebx,%eax
        rorl    $7,%ebx
+       por     %xmm8,%xmm7
        xorl    %edx,%edi
        movl    %eax,%esi
        addl    60(%rsp),%ebp
        pslld   $2,%xmm9
        pxor    %xmm10,%xmm7
        xorl    %ecx,%ebx
+       movdqa  -32(%r14),%xmm10
        roll    $5,%eax
-       movdqa  16(%r11),%xmm10
        addl    %edi,%ebp
        andl    %ebx,%esi
        pxor    %xmm9,%xmm7
+       pshufd  $238,%xmm6,%xmm9
        xorl    %ecx,%ebx
        addl    %eax,%ebp
-       movdqa  %xmm7,%xmm9
        rorl    $7,%eax
        pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,206,8
        xorl    %ecx,%esi
        movl    %ebp,%edi
        addl    0(%rsp),%edx
-       pxor    %xmm1,%xmm0
+       punpcklqdq      %xmm7,%xmm9
        xorl    %ebx,%eax
        roll    $5,%ebp
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm7,%xmm10
+       pxor    %xmm1,%xmm0
        addl    %esi,%edx
        andl    %eax,%edi
-       pxor    %xmm9,%xmm0
+       movdqa  %xmm10,%xmm8
        xorl    %ebx,%eax
+       paddd   %xmm7,%xmm10
        addl    %ebp,%edx
+       pxor    %xmm9,%xmm0
        rorl    $7,%ebp
        xorl    %ebx,%edi
-       movdqa  %xmm0,%xmm9
-       movdqa  %xmm10,48(%rsp)
        movl    %edx,%esi
        addl    4(%rsp),%ecx
+       movdqa  %xmm0,%xmm9
        xorl    %eax,%ebp
        roll    $5,%edx
-       pslld   $2,%xmm0
+       movdqa  %xmm10,48(%rsp)
        addl    %edi,%ecx
        andl    %ebp,%esi
-       psrld   $30,%xmm9
        xorl    %eax,%ebp
+       pslld   $2,%xmm0
        addl    %edx,%ecx
        rorl    $7,%edx
+       psrld   $30,%xmm9
        xorl    %eax,%esi
        movl    %ecx,%edi
        addl    8(%rsp),%ebx
        por     %xmm9,%xmm0
        xorl    %ebp,%edx
        roll    $5,%ecx
-       movdqa  %xmm0,%xmm10
+       pshufd  $238,%xmm7,%xmm10
        addl    %esi,%ebx
        andl    %edx,%edi
        xorl    %ebp,%edx
@@ -1696,18 +1865,18 @@ _ssse3_shortcut:
        xorl    %edx,%esi
        rorl    $7,%ecx
        addl    %ebx,%eax
-       addl    16(%rsp),%ebp
        pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,215,8
+       addl    16(%rsp),%ebp
        xorl    %ecx,%esi
+       punpcklqdq      %xmm0,%xmm10
        movl    %eax,%edi
        roll    $5,%eax
        pxor    %xmm2,%xmm1
        addl    %esi,%ebp
        xorl    %ecx,%edi
        movdqa  %xmm8,%xmm9
-       paddd   %xmm0,%xmm8
        rorl    $7,%ebx
+       paddd   %xmm0,%xmm8
        addl    %eax,%ebp
        pxor    %xmm10,%xmm1
        addl    20(%rsp),%edx
@@ -1715,43 +1884,43 @@ _ssse3_shortcut:
        movl    %ebp,%esi
        roll    $5,%ebp
        movdqa  %xmm1,%xmm10
-       movdqa  %xmm8,0(%rsp)
        addl    %edi,%edx
        xorl    %ebx,%esi
+       movdqa  %xmm8,0(%rsp)
        rorl    $7,%eax
        addl    %ebp,%edx
-       pslld   $2,%xmm1
        addl    24(%rsp),%ecx
+       pslld   $2,%xmm1
        xorl    %eax,%esi
-       psrld   $30,%xmm10
        movl    %edx,%edi
+       psrld   $30,%xmm10
        roll    $5,%edx
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
-       addl    %edx,%ecx
        por     %xmm10,%xmm1
+       addl    %edx,%ecx
        addl    28(%rsp),%ebx
+       pshufd  $238,%xmm0,%xmm8
        xorl    %ebp,%edi
-       movdqa  %xmm1,%xmm8
        movl    %ecx,%esi
        roll    $5,%ecx
        addl    %edi,%ebx
        xorl    %ebp,%esi
        rorl    $7,%edx
        addl    %ecx,%ebx
-       addl    32(%rsp),%eax
        pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,192,8
+       addl    32(%rsp),%eax
        xorl    %edx,%esi
+       punpcklqdq      %xmm1,%xmm8
        movl    %ebx,%edi
        roll    $5,%ebx
        pxor    %xmm3,%xmm2
        addl    %esi,%eax
        xorl    %edx,%edi
-       movdqa  32(%r11),%xmm10
-       paddd   %xmm1,%xmm9
+       movdqa  0(%r14),%xmm10
        rorl    $7,%ecx
+       paddd   %xmm1,%xmm9
        addl    %ebx,%eax
        pxor    %xmm8,%xmm2
        addl    36(%rsp),%ebp
@@ -1759,43 +1928,43 @@ _ssse3_shortcut:
        movl    %eax,%esi
        roll    $5,%eax
        movdqa  %xmm2,%xmm8
-       movdqa  %xmm9,16(%rsp)
        addl    %edi,%ebp
        xorl    %ecx,%esi
+       movdqa  %xmm9,16(%rsp)
        rorl    $7,%ebx
        addl    %eax,%ebp
-       pslld   $2,%xmm2
        addl    40(%rsp),%edx
+       pslld   $2,%xmm2
        xorl    %ebx,%esi
-       psrld   $30,%xmm8
        movl    %ebp,%edi
+       psrld   $30,%xmm8
        roll    $5,%ebp
        addl    %esi,%edx
        xorl    %ebx,%edi
        rorl    $7,%eax
-       addl    %ebp,%edx
        por     %xmm8,%xmm2
+       addl    %ebp,%edx
        addl    44(%rsp),%ecx
+       pshufd  $238,%xmm1,%xmm9
        xorl    %eax,%edi
-       movdqa  %xmm2,%xmm9
        movl    %edx,%esi
        roll    $5,%edx
        addl    %edi,%ecx
        xorl    %eax,%esi
        rorl    $7,%ebp
        addl    %edx,%ecx
-       addl    48(%rsp),%ebx
        pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,201,8
+       addl    48(%rsp),%ebx
        xorl    %ebp,%esi
+       punpcklqdq      %xmm2,%xmm9
        movl    %ecx,%edi
        roll    $5,%ecx
        pxor    %xmm4,%xmm3
        addl    %esi,%ebx
        xorl    %ebp,%edi
        movdqa  %xmm10,%xmm8
-       paddd   %xmm2,%xmm10
        rorl    $7,%edx
+       paddd   %xmm2,%xmm10
        addl    %ecx,%ebx
        pxor    %xmm9,%xmm3
        addl    52(%rsp),%eax
@@ -1803,43 +1972,43 @@ _ssse3_shortcut:
        movl    %ebx,%esi
        roll    $5,%ebx
        movdqa  %xmm3,%xmm9
-       movdqa  %xmm10,32(%rsp)
        addl    %edi,%eax
        xorl    %edx,%esi
+       movdqa  %xmm10,32(%rsp)
        rorl    $7,%ecx
        addl    %ebx,%eax
-       pslld   $2,%xmm3
        addl    56(%rsp),%ebp
+       pslld   $2,%xmm3
        xorl    %ecx,%esi
-       psrld   $30,%xmm9
        movl    %eax,%edi
+       psrld   $30,%xmm9
        roll    $5,%eax
        addl    %esi,%ebp
        xorl    %ecx,%edi
        rorl    $7,%ebx
-       addl    %eax,%ebp
        por     %xmm9,%xmm3
+       addl    %eax,%ebp
        addl    60(%rsp),%edx
+       pshufd  $238,%xmm2,%xmm10
        xorl    %ebx,%edi
-       movdqa  %xmm3,%xmm10
        movl    %ebp,%esi
        roll    $5,%ebp
        addl    %edi,%edx
        xorl    %ebx,%esi
        rorl    $7,%eax
        addl    %ebp,%edx
-       addl    0(%rsp),%ecx
        pxor    %xmm0,%xmm4
-.byte  102,68,15,58,15,210,8
+       addl    0(%rsp),%ecx
        xorl    %eax,%esi
+       punpcklqdq      %xmm3,%xmm10
        movl    %edx,%edi
        roll    $5,%edx
        pxor    %xmm5,%xmm4
        addl    %esi,%ecx
        xorl    %eax,%edi
        movdqa  %xmm8,%xmm9
-       paddd   %xmm3,%xmm8
        rorl    $7,%ebp
+       paddd   %xmm3,%xmm8
        addl    %edx,%ecx
        pxor    %xmm10,%xmm4
        addl    4(%rsp),%ebx
@@ -1847,43 +2016,43 @@ _ssse3_shortcut:
        movl    %ecx,%esi
        roll    $5,%ecx
        movdqa  %xmm4,%xmm10
-       movdqa  %xmm8,48(%rsp)
        addl    %edi,%ebx
        xorl    %ebp,%esi
+       movdqa  %xmm8,48(%rsp)
        rorl    $7,%edx
        addl    %ecx,%ebx
-       pslld   $2,%xmm4
        addl    8(%rsp),%eax
+       pslld   $2,%xmm4
        xorl    %edx,%esi
-       psrld   $30,%xmm10
        movl    %ebx,%edi
+       psrld   $30,%xmm10
        roll    $5,%ebx
        addl    %esi,%eax
        xorl    %edx,%edi
        rorl    $7,%ecx
-       addl    %ebx,%eax
        por     %xmm10,%xmm4
+       addl    %ebx,%eax
        addl    12(%rsp),%ebp
+       pshufd  $238,%xmm3,%xmm8
        xorl    %ecx,%edi
-       movdqa  %xmm4,%xmm8
        movl    %eax,%esi
        roll    $5,%eax
        addl    %edi,%ebp
        xorl    %ecx,%esi
        rorl    $7,%ebx
        addl    %eax,%ebp
-       addl    16(%rsp),%edx
        pxor    %xmm1,%xmm5
-.byte  102,68,15,58,15,195,8
+       addl    16(%rsp),%edx
        xorl    %ebx,%esi
+       punpcklqdq      %xmm4,%xmm8
        movl    %ebp,%edi
        roll    $5,%ebp
        pxor    %xmm6,%xmm5
        addl    %esi,%edx
        xorl    %ebx,%edi
        movdqa  %xmm9,%xmm10
-       paddd   %xmm4,%xmm9
        rorl    $7,%eax
+       paddd   %xmm4,%xmm9
        addl    %ebp,%edx
        pxor    %xmm8,%xmm5
        addl    20(%rsp),%ecx
@@ -1891,24 +2060,24 @@ _ssse3_shortcut:
        movl    %edx,%esi
        roll    $5,%edx
        movdqa  %xmm5,%xmm8
-       movdqa  %xmm9,0(%rsp)
        addl    %edi,%ecx
        xorl    %eax,%esi
+       movdqa  %xmm9,0(%rsp)
        rorl    $7,%ebp
        addl    %edx,%ecx
-       pslld   $2,%xmm5
        addl    24(%rsp),%ebx
+       pslld   $2,%xmm5
        xorl    %ebp,%esi
-       psrld   $30,%xmm8
        movl    %ecx,%edi
+       psrld   $30,%xmm8
        roll    $5,%ecx
        addl    %esi,%ebx
        xorl    %ebp,%edi
        rorl    $7,%edx
-       addl    %ecx,%ebx
        por     %xmm8,%xmm5
+       addl    %ecx,%ebx
        addl    28(%rsp),%eax
-       movdqa  %xmm5,%xmm9
+       pshufd  $238,%xmm4,%xmm9
        rorl    $7,%ecx
        movl    %ebx,%esi
        xorl    %edx,%edi
@@ -1917,47 +2086,47 @@ _ssse3_shortcut:
        xorl    %ecx,%esi
        xorl    %edx,%ecx
        addl    %ebx,%eax
-       addl    32(%rsp),%ebp
        pxor    %xmm2,%xmm6
-.byte  102,68,15,58,15,204,8
+       addl    32(%rsp),%ebp
        andl    %ecx,%esi
        xorl    %edx,%ecx
        rorl    $7,%ebx
-       pxor    %xmm7,%xmm6
+       punpcklqdq      %xmm5,%xmm9
        movl    %eax,%edi
        xorl    %ecx,%esi
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm5,%xmm10
+       pxor    %xmm7,%xmm6
        roll    $5,%eax
        addl    %esi,%ebp
-       pxor    %xmm9,%xmm6
+       movdqa  %xmm10,%xmm8
        xorl    %ebx,%edi
+       paddd   %xmm5,%xmm10
        xorl    %ecx,%ebx
+       pxor    %xmm9,%xmm6
        addl    %eax,%ebp
        addl    36(%rsp),%edx
-       movdqa  %xmm6,%xmm9
-       movdqa  %xmm10,16(%rsp)
        andl    %ebx,%edi
        xorl    %ecx,%ebx
        rorl    $7,%eax
+       movdqa  %xmm6,%xmm9
        movl    %ebp,%esi
-       pslld   $2,%xmm6
        xorl    %ebx,%edi
+       movdqa  %xmm10,16(%rsp)
        roll    $5,%ebp
-       psrld   $30,%xmm9
        addl    %edi,%edx
        xorl    %eax,%esi
+       pslld   $2,%xmm6
        xorl    %ebx,%eax
        addl    %ebp,%edx
+       psrld   $30,%xmm9
        addl    40(%rsp),%ecx
        andl    %eax,%esi
-       por     %xmm9,%xmm6
        xorl    %ebx,%eax
+       por     %xmm9,%xmm6
        rorl    $7,%ebp
-       movdqa  %xmm6,%xmm10
        movl    %edx,%edi
        xorl    %eax,%esi
        roll    $5,%edx
+       pshufd  $238,%xmm5,%xmm10
        addl    %esi,%ecx
        xorl    %ebp,%edi
        xorl    %eax,%ebp
@@ -1973,47 +2142,47 @@ _ssse3_shortcut:
        xorl    %edx,%esi
        xorl    %ebp,%edx
        addl    %ecx,%ebx
-       addl    48(%rsp),%eax
        pxor    %xmm3,%xmm7
-.byte  102,68,15,58,15,213,8
+       addl    48(%rsp),%eax
        andl    %edx,%esi
        xorl    %ebp,%edx
        rorl    $7,%ecx
-       pxor    %xmm0,%xmm7
+       punpcklqdq      %xmm6,%xmm10
        movl    %ebx,%edi
        xorl    %edx,%esi
-       movdqa  48(%r11),%xmm9
-       paddd   %xmm6,%xmm8
+       pxor    %xmm0,%xmm7
        roll    $5,%ebx
        addl    %esi,%eax
-       pxor    %xmm10,%xmm7
+       movdqa  32(%r14),%xmm9
        xorl    %ecx,%edi
+       paddd   %xmm6,%xmm8
        xorl    %edx,%ecx
+       pxor    %xmm10,%xmm7
        addl    %ebx,%eax
        addl    52(%rsp),%ebp
-       movdqa  %xmm7,%xmm10
-       movdqa  %xmm8,32(%rsp)
        andl    %ecx,%edi
        xorl    %edx,%ecx
        rorl    $7,%ebx
+       movdqa  %xmm7,%xmm10
        movl    %eax,%esi
-       pslld   $2,%xmm7
        xorl    %ecx,%edi
+       movdqa  %xmm8,32(%rsp)
        roll    $5,%eax
-       psrld   $30,%xmm10
        addl    %edi,%ebp
        xorl    %ebx,%esi
+       pslld   $2,%xmm7
        xorl    %ecx,%ebx
        addl    %eax,%ebp
+       psrld   $30,%xmm10
        addl    56(%rsp),%edx
        andl    %ebx,%esi
-       por     %xmm10,%xmm7
        xorl    %ecx,%ebx
+       por     %xmm10,%xmm7
        rorl    $7,%eax
-       movdqa  %xmm7,%xmm8
        movl    %ebp,%edi
        xorl    %ebx,%esi
        roll    $5,%ebp
+       pshufd  $238,%xmm6,%xmm8
        addl    %esi,%edx
        xorl    %eax,%edi
        xorl    %ebx,%eax
@@ -2029,47 +2198,47 @@ _ssse3_shortcut:
        xorl    %ebp,%esi
        xorl    %eax,%ebp
        addl    %edx,%ecx
-       addl    0(%rsp),%ebx
        pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,198,8
+       addl    0(%rsp),%ebx
        andl    %ebp,%esi
        xorl    %eax,%ebp
        rorl    $7,%edx
-       pxor    %xmm1,%xmm0
+       punpcklqdq      %xmm7,%xmm8
        movl    %ecx,%edi
        xorl    %ebp,%esi
-       movdqa  %xmm9,%xmm10
-       paddd   %xmm7,%xmm9
+       pxor    %xmm1,%xmm0
        roll    $5,%ecx
        addl    %esi,%ebx
-       pxor    %xmm8,%xmm0
+       movdqa  %xmm9,%xmm10
        xorl    %edx,%edi
+       paddd   %xmm7,%xmm9
        xorl    %ebp,%edx
+       pxor    %xmm8,%xmm0
        addl    %ecx,%ebx
        addl    4(%rsp),%eax
-       movdqa  %xmm0,%xmm8
-       movdqa  %xmm9,48(%rsp)
        andl    %edx,%edi
        xorl    %ebp,%edx
        rorl    $7,%ecx
+       movdqa  %xmm0,%xmm8
        movl    %ebx,%esi
-       pslld   $2,%xmm0
        xorl    %edx,%edi
+       movdqa  %xmm9,48(%rsp)
        roll    $5,%ebx
-       psrld   $30,%xmm8
        addl    %edi,%eax
        xorl    %ecx,%esi
+       pslld   $2,%xmm0
        xorl    %edx,%ecx
        addl    %ebx,%eax
+       psrld   $30,%xmm8
        addl    8(%rsp),%ebp
        andl    %ecx,%esi
-       por     %xmm8,%xmm0
        xorl    %edx,%ecx
+       por     %xmm8,%xmm0
        rorl    $7,%ebx
-       movdqa  %xmm0,%xmm9
        movl    %eax,%edi
        xorl    %ecx,%esi
        roll    $5,%eax
+       pshufd  $238,%xmm7,%xmm9
        addl    %esi,%ebp
        xorl    %ebx,%edi
        xorl    %ecx,%ebx
@@ -2085,47 +2254,47 @@ _ssse3_shortcut:
        xorl    %eax,%esi
        xorl    %ebx,%eax
        addl    %ebp,%edx
-       addl    16(%rsp),%ecx
        pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,207,8
+       addl    16(%rsp),%ecx
        andl    %eax,%esi
        xorl    %ebx,%eax
        rorl    $7,%ebp
-       pxor    %xmm2,%xmm1
+       punpcklqdq      %xmm0,%xmm9
        movl    %edx,%edi
        xorl    %eax,%esi
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm0,%xmm10
+       pxor    %xmm2,%xmm1
        roll    $5,%edx
        addl    %esi,%ecx
-       pxor    %xmm9,%xmm1
+       movdqa  %xmm10,%xmm8
        xorl    %ebp,%edi
+       paddd   %xmm0,%xmm10
        xorl    %eax,%ebp
+       pxor    %xmm9,%xmm1
        addl    %edx,%ecx
        addl    20(%rsp),%ebx
-       movdqa  %xmm1,%xmm9
-       movdqa  %xmm10,0(%rsp)
        andl    %ebp,%edi
        xorl    %eax,%ebp
        rorl    $7,%edx
+       movdqa  %xmm1,%xmm9
        movl    %ecx,%esi
-       pslld   $2,%xmm1
        xorl    %ebp,%edi
+       movdqa  %xmm10,0(%rsp)
        roll    $5,%ecx
-       psrld   $30,%xmm9
        addl    %edi,%ebx
        xorl    %edx,%esi
+       pslld   $2,%xmm1
        xorl    %ebp,%edx
        addl    %ecx,%ebx
+       psrld   $30,%xmm9
        addl    24(%rsp),%eax
        andl    %edx,%esi
-       por     %xmm9,%xmm1
        xorl    %ebp,%edx
+       por     %xmm9,%xmm1
        rorl    $7,%ecx
-       movdqa  %xmm1,%xmm10
        movl    %ebx,%edi
        xorl    %edx,%esi
        roll    $5,%ebx
+       pshufd  $238,%xmm0,%xmm10
        addl    %esi,%eax
        xorl    %ecx,%edi
        xorl    %edx,%ecx
@@ -2141,47 +2310,47 @@ _ssse3_shortcut:
        xorl    %ebx,%esi
        xorl    %ecx,%ebx
        addl    %eax,%ebp
-       addl    32(%rsp),%edx
        pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,208,8
+       addl    32(%rsp),%edx
        andl    %ebx,%esi
        xorl    %ecx,%ebx
        rorl    $7,%eax
-       pxor    %xmm3,%xmm2
+       punpcklqdq      %xmm1,%xmm10
        movl    %ebp,%edi
        xorl    %ebx,%esi
-       movdqa  %xmm8,%xmm9
-       paddd   %xmm1,%xmm8
+       pxor    %xmm3,%xmm2
        roll    $5,%ebp
        addl    %esi,%edx
-       pxor    %xmm10,%xmm2
+       movdqa  %xmm8,%xmm9
        xorl    %eax,%edi
+       paddd   %xmm1,%xmm8
        xorl    %ebx,%eax
+       pxor    %xmm10,%xmm2
        addl    %ebp,%edx
        addl    36(%rsp),%ecx
-       movdqa  %xmm2,%xmm10
-       movdqa  %xmm8,16(%rsp)
        andl    %eax,%edi
        xorl    %ebx,%eax
        rorl    $7,%ebp
+       movdqa  %xmm2,%xmm10
        movl    %edx,%esi
-       pslld   $2,%xmm2
        xorl    %eax,%edi
+       movdqa  %xmm8,16(%rsp)
        roll    $5,%edx
-       psrld   $30,%xmm10
        addl    %edi,%ecx
        xorl    %ebp,%esi
+       pslld   $2,%xmm2
        xorl    %eax,%ebp
        addl    %edx,%ecx
+       psrld   $30,%xmm10
        addl    40(%rsp),%ebx
        andl    %ebp,%esi
-       por     %xmm10,%xmm2
        xorl    %eax,%ebp
+       por     %xmm10,%xmm2
        rorl    $7,%edx
-       movdqa  %xmm2,%xmm8
        movl    %ecx,%edi
        xorl    %ebp,%esi
        roll    $5,%ecx
+       pshufd  $238,%xmm1,%xmm8
        addl    %esi,%ebx
        xorl    %edx,%edi
        xorl    %ebp,%edx
@@ -2196,18 +2365,18 @@ _ssse3_shortcut:
        addl    %edi,%eax
        xorl    %edx,%esi
        addl    %ebx,%eax
-       addl    48(%rsp),%ebp
        pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,193,8
+       addl    48(%rsp),%ebp
        xorl    %ecx,%esi
+       punpcklqdq      %xmm2,%xmm8
        movl    %eax,%edi
        roll    $5,%eax
        pxor    %xmm4,%xmm3
        addl    %esi,%ebp
        xorl    %ecx,%edi
        movdqa  %xmm9,%xmm10
-       paddd   %xmm2,%xmm9
        rorl    $7,%ebx
+       paddd   %xmm2,%xmm9
        addl    %eax,%ebp
        pxor    %xmm8,%xmm3
        addl    52(%rsp),%edx
@@ -2215,22 +2384,22 @@ _ssse3_shortcut:
        movl    %ebp,%esi
        roll    $5,%ebp
        movdqa  %xmm3,%xmm8
-       movdqa  %xmm9,32(%rsp)
        addl    %edi,%edx
        xorl    %ebx,%esi
+       movdqa  %xmm9,32(%rsp)
        rorl    $7,%eax
        addl    %ebp,%edx
-       pslld   $2,%xmm3
        addl    56(%rsp),%ecx
+       pslld   $2,%xmm3
        xorl    %eax,%esi
-       psrld   $30,%xmm8
        movl    %edx,%edi
+       psrld   $30,%xmm8
        roll    $5,%edx
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
-       addl    %edx,%ecx
        por     %xmm8,%xmm3
+       addl    %edx,%ecx
        addl    60(%rsp),%ebx
        xorl    %ebp,%edi
        movl    %ecx,%esi
@@ -2240,13 +2409,13 @@ _ssse3_shortcut:
        rorl    $7,%edx
        addl    %ecx,%ebx
        addl    0(%rsp),%eax
-       paddd   %xmm3,%xmm10
        xorl    %edx,%esi
        movl    %ebx,%edi
        roll    $5,%ebx
+       paddd   %xmm3,%xmm10
        addl    %esi,%eax
-       movdqa  %xmm10,48(%rsp)
        xorl    %edx,%edi
+       movdqa  %xmm10,48(%rsp)
        rorl    $7,%ecx
        addl    %ebx,%eax
        addl    4(%rsp),%ebp
@@ -2275,8 +2444,8 @@ _ssse3_shortcut:
        addl    %edx,%ecx
        cmpq    %r10,%r9
        je      .Ldone_ssse3
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
+       movdqa  64(%r14),%xmm6
+       movdqa  -64(%r14),%xmm9
        movdqu  0(%r9),%xmm0
        movdqu  16(%r9),%xmm1
        movdqu  32(%r9),%xmm2
@@ -2285,23 +2454,23 @@ _ssse3_shortcut:
        addq    $64,%r9
        addl    16(%rsp),%ebx
        xorl    %ebp,%esi
-.byte  102,15,56,0,206
        movl    %ecx,%edi
+.byte  102,15,56,0,206
        roll    $5,%ecx
-       paddd   %xmm9,%xmm0
        addl    %esi,%ebx
        xorl    %ebp,%edi
        rorl    $7,%edx
+       paddd   %xmm9,%xmm0
        addl    %ecx,%ebx
-       movdqa  %xmm0,0(%rsp)
        addl    20(%rsp),%eax
        xorl    %edx,%edi
-       psubd   %xmm9,%xmm0
        movl    %ebx,%esi
+       movdqa  %xmm0,0(%rsp)
        roll    $5,%ebx
        addl    %edi,%eax
        xorl    %edx,%esi
        rorl    $7,%ecx
+       psubd   %xmm9,%xmm0
        addl    %ebx,%eax
        addl    24(%rsp),%ebp
        xorl    %ecx,%esi
@@ -2321,23 +2490,23 @@ _ssse3_shortcut:
        addl    %ebp,%edx
        addl    32(%rsp),%ecx
        xorl    %eax,%esi
-.byte  102,15,56,0,214
        movl    %edx,%edi
+.byte  102,15,56,0,214
        roll    $5,%edx
-       paddd   %xmm9,%xmm1
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
+       paddd   %xmm9,%xmm1
        addl    %edx,%ecx
-       movdqa  %xmm1,16(%rsp)
        addl    36(%rsp),%ebx
        xorl    %ebp,%edi
-       psubd   %xmm9,%xmm1
        movl    %ecx,%esi
+       movdqa  %xmm1,16(%rsp)
        roll    $5,%ecx
        addl    %edi,%ebx
        xorl    %ebp,%esi
        rorl    $7,%edx
+       psubd   %xmm9,%xmm1
        addl    %ecx,%ebx
        addl    40(%rsp),%eax
        xorl    %edx,%esi
@@ -2345,197 +2514,3079 @@ _ssse3_shortcut:
        roll    $5,%ebx
        addl    %esi,%eax
        xorl    %edx,%edi
-       rorl    $7,%ecx
-       addl    %ebx,%eax
-       addl    44(%rsp),%ebp
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       roll    $5,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+.byte  102,15,56,0,222
+       roll    $5,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       rorl    $7,%eax
+       paddd   %xmm9,%xmm2
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       movdqa  %xmm2,32(%rsp)
+       roll    $5,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       rorl    $7,%ebp
+       psubd   %xmm9,%xmm2
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       roll    $5,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       roll    $5,%ebx
+       addl    %edi,%eax
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    0(%r8),%eax
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       addl    12(%r8),%edx
+       movl    %eax,0(%r8)
+       addl    16(%r8),%ebp
+       movl    %esi,4(%r8)
+       movl    %esi,%ebx
+       movl    %ecx,8(%r8)
+       movl    %ecx,%edi
+       movl    %edx,12(%r8)
+       xorl    %edx,%edi
+       movl    %ebp,16(%r8)
+       andl    %edi,%esi
+       jmp     .Loop_ssse3
+
+.p2align       4
+.Ldone_ssse3:
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       roll    $5,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       roll    $5,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       roll    $5,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       roll    $5,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       roll    $5,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       rorl    $7,%ebp
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       roll    $5,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       roll    $5,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       roll    $5,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       roll    $5,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       roll    $5,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       rorl    $7,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       roll    $5,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       roll    $5,%ebx
+       addl    %edi,%eax
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    0(%r8),%eax
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       movl    %eax,0(%r8)
+       addl    12(%r8),%edx
+       movl    %esi,4(%r8)
+       addl    16(%r8),%ebp
+       movl    %ecx,8(%r8)
+       movl    %edx,12(%r8)
+       movl    %ebp,16(%r8)
+       movaps  -40-96(%r11),%xmm6
+       movaps  -40-80(%r11),%xmm7
+       movaps  -40-64(%r11),%xmm8
+       movaps  -40-48(%r11),%xmm9
+       movaps  -40-32(%r11),%xmm10
+       movaps  -40-16(%r11),%xmm11
+       movq    -40(%r11),%r14
+
+       movq    -32(%r11),%r13
+
+       movq    -24(%r11),%r12
+
+       movq    -16(%r11),%rbp
+
+       movq    -8(%r11),%rbx
+
+       leaq    (%r11),%rsp
+
+.Lepilogue_ssse3:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_sha1_block_data_order_ssse3:
+.def   sha1_block_data_order_avx;      .scl 3; .type 32;       .endef
+.p2align       4
+sha1_block_data_order_avx:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha1_block_data_order_avx:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+_avx_shortcut:
+
+       movq    %rsp,%r11
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       leaq    -160(%rsp),%rsp
+       vzeroupper
+       vmovaps %xmm6,-40-96(%r11)
+       vmovaps %xmm7,-40-80(%r11)
+       vmovaps %xmm8,-40-64(%r11)
+       vmovaps %xmm9,-40-48(%r11)
+       vmovaps %xmm10,-40-32(%r11)
+       vmovaps %xmm11,-40-16(%r11)
+.Lprologue_avx:
+       andq    $-64,%rsp
+       movq    %rdi,%r8
+       movq    %rsi,%r9
+       movq    %rdx,%r10
+
+       shlq    $6,%r10
+       addq    %r9,%r10
+       leaq    K_XX_XX+64(%rip),%r14
+
+       movl    0(%r8),%eax
+       movl    4(%r8),%ebx
+       movl    8(%r8),%ecx
+       movl    12(%r8),%edx
+       movl    %ebx,%esi
+       movl    16(%r8),%ebp
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       andl    %edi,%esi
+
+       vmovdqa 64(%r14),%xmm6
+       vmovdqa -64(%r14),%xmm11
+       vmovdqu 0(%r9),%xmm0
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0
+       addq    $64,%r9
+       vpshufb %xmm6,%xmm1,%xmm1
+       vpshufb %xmm6,%xmm2,%xmm2
+       vpshufb %xmm6,%xmm3,%xmm3
+       vpaddd  %xmm11,%xmm0,%xmm4
+       vpaddd  %xmm11,%xmm1,%xmm5
+       vpaddd  %xmm11,%xmm2,%xmm6
+       vmovdqa %xmm4,0(%rsp)
+       vmovdqa %xmm5,16(%rsp)
+       vmovdqa %xmm6,32(%rsp)
+       jmp     .Loop_avx
+.p2align       4
+.Loop_avx:
+       shrdl   $2,%ebx,%ebx
+       xorl    %edx,%esi
+       vpalignr        $8,%xmm0,%xmm1,%xmm4
+       movl    %eax,%edi
+       addl    0(%rsp),%ebp
+       vpaddd  %xmm3,%xmm11,%xmm9
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrldq $4,%xmm3,%xmm8
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       vpxor   %xmm0,%xmm4,%xmm4
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm2,%xmm8,%xmm8
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    4(%rsp),%edx
+       vpxor   %xmm8,%xmm4,%xmm4
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%edx
+       andl    %eax,%esi
+       vpsrld  $31,%xmm4,%xmm8
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpslldq $12,%xmm4,%xmm10
+       vpaddd  %xmm4,%xmm4,%xmm4
+       movl    %edx,%edi
+       addl    8(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm4,%xmm4
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    12(%rsp),%ebx
+       vpxor   %xmm10,%xmm4,%xmm4
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpalignr        $8,%xmm1,%xmm2,%xmm5
+       movl    %ebx,%edi
+       addl    16(%rsp),%eax
+       vpaddd  %xmm4,%xmm11,%xmm9
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrldq $4,%xmm4,%xmm8
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       vpxor   %xmm1,%xmm5,%xmm5
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm3,%xmm8,%xmm8
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    20(%rsp),%ebp
+       vpxor   %xmm8,%xmm5,%xmm5
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       vpsrld  $31,%xmm5,%xmm8
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       vpslldq $12,%xmm5,%xmm10
+       vpaddd  %xmm5,%xmm5,%xmm5
+       movl    %ebp,%edi
+       addl    24(%rsp),%edx
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    %esi,%edx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm5,%xmm5
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       movl    %edx,%esi
+       addl    28(%rsp),%ecx
+       vpxor   %xmm10,%xmm5,%xmm5
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vmovdqa -32(%r14),%xmm11
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       vpalignr        $8,%xmm2,%xmm3,%xmm6
+       movl    %ecx,%edi
+       addl    32(%rsp),%ebx
+       vpaddd  %xmm5,%xmm11,%xmm9
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vpsrldq $4,%xmm5,%xmm8
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       vpxor   %xmm2,%xmm6,%xmm6
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm4,%xmm8,%xmm8
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       addl    36(%rsp),%eax
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%eax
+       andl    %ecx,%esi
+       vpsrld  $31,%xmm6,%xmm8
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%esi
+       vpslldq $12,%xmm6,%xmm10
+       vpaddd  %xmm6,%xmm6,%xmm6
+       movl    %eax,%edi
+       addl    40(%rsp),%ebp
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm6,%xmm6
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm6,%xmm6
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    44(%rsp),%edx
+       vpxor   %xmm10,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpalignr        $8,%xmm3,%xmm4,%xmm7
+       movl    %edx,%edi
+       addl    48(%rsp),%ecx
+       vpaddd  %xmm6,%xmm11,%xmm9
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrldq $4,%xmm6,%xmm8
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       vpxor   %xmm3,%xmm7,%xmm7
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm5,%xmm8,%xmm8
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    52(%rsp),%ebx
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       vpsrld  $31,%xmm7,%xmm8
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpslldq $12,%xmm7,%xmm10
+       vpaddd  %xmm7,%xmm7,%xmm7
+       movl    %ebx,%edi
+       addl    56(%rsp),%eax
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm7,%xmm7
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm7,%xmm7
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    60(%rsp),%ebp
+       vpxor   %xmm10,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       movl    %ebp,%edi
+       addl    0(%rsp),%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm7,%xmm11,%xmm9
+       addl    %esi,%edx
+       andl    %eax,%edi
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       movl    %edx,%esi
+       addl    4(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpslld  $2,%xmm0,%xmm0
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       movl    %ecx,%edi
+       addl    8(%rsp),%ebx
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    12(%rsp),%eax
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm2,%xmm1,%xmm1
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm0,%xmm11,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm1,%xmm1
+       addl    20(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm1,%xmm1
+       addl    24(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm1,%xmm1
+       addl    28(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       vpxor   %xmm3,%xmm2,%xmm2
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       vpaddd  %xmm1,%xmm11,%xmm9
+       vmovdqa 0(%r14),%xmm11
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm8,%xmm2,%xmm2
+       addl    36(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm2,%xmm2
+       addl    40(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpor    %xmm8,%xmm2,%xmm2
+       addl    44(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       vpaddd  %xmm2,%xmm11,%xmm9
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       vpxor   %xmm0,%xmm4,%xmm4
+       addl    0(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpxor   %xmm5,%xmm4,%xmm4
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       vpaddd  %xmm3,%xmm11,%xmm9
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm8,%xmm4,%xmm4
+       addl    4(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       vpsrld  $30,%xmm4,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpslld  $2,%xmm4,%xmm4
+       addl    8(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    12(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       vpxor   %xmm1,%xmm5,%xmm5
+       addl    16(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpxor   %xmm6,%xmm5,%xmm5
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       vpaddd  %xmm4,%xmm11,%xmm9
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpxor   %xmm8,%xmm5,%xmm5
+       addl    20(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm5,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm5,%xmm5
+       addl    24(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    28(%rsp),%eax
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       vpxor   %xmm2,%xmm6,%xmm6
+       addl    32(%rsp),%ebp
+       andl    %ecx,%esi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       vpaddd  %xmm5,%xmm11,%xmm9
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    36(%rsp),%edx
+       vpsrld  $30,%xmm6,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       vpslld  $2,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    40(%rsp),%ecx
+       andl    %eax,%esi
+       vpor    %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    44(%rsp),%ebx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%esi
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       vpxor   %xmm3,%xmm7,%xmm7
+       addl    48(%rsp),%eax
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       vpxor   %xmm0,%xmm7,%xmm7
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       vpaddd  %xmm6,%xmm11,%xmm9
+       vmovdqa 32(%r14),%xmm11
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    52(%rsp),%ebp
+       vpsrld  $30,%xmm7,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       vpslld  $2,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    56(%rsp),%edx
+       andl    %ebx,%esi
+       vpor    %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    60(%rsp),%ecx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       addl    0(%rsp),%ebx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       vpaddd  %xmm7,%xmm11,%xmm9
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    4(%rsp),%eax
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       vpslld  $2,%xmm0,%xmm0
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    8(%rsp),%ebp
+       andl    %ecx,%esi
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    12(%rsp),%edx
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ecx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       vpxor   %xmm2,%xmm1,%xmm1
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       vpaddd  %xmm0,%xmm11,%xmm9
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vpxor   %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    20(%rsp),%ebx
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%esi
+       vpslld  $2,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    24(%rsp),%eax
+       andl    %edx,%esi
+       vpor    %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    28(%rsp),%ebp
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%edx
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       vpxor   %xmm3,%xmm2,%xmm2
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       vpaddd  %xmm1,%xmm11,%xmm9
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       vpxor   %xmm8,%xmm2,%xmm2
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    36(%rsp),%ecx
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       vpslld  $2,%xmm2,%xmm2
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    40(%rsp),%ebx
+       andl    %ebp,%esi
+       vpor    %xmm8,%xmm2,%xmm2
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    44(%rsp),%eax
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm2,%xmm11,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    0(%rsp),%eax
+       vpaddd  %xmm3,%xmm11,%xmm9
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vmovdqa %xmm9,48(%rsp)
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    4(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    8(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    12(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       cmpq    %r10,%r9
+       je      .Ldone_avx
+       vmovdqa 64(%r14),%xmm6
+       vmovdqa -64(%r14),%xmm11
+       vmovdqu 0(%r9),%xmm0
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0
+       addq    $64,%r9
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       vpshufb %xmm6,%xmm1,%xmm1
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpaddd  %xmm11,%xmm0,%xmm4
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vmovdqa %xmm4,0(%rsp)
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       vpshufb %xmm6,%xmm2,%xmm2
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpaddd  %xmm11,%xmm1,%xmm5
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vmovdqa %xmm5,16(%rsp)
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       vpshufb %xmm6,%xmm3,%xmm3
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm11,%xmm2,%xmm6
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vmovdqa %xmm6,32(%rsp)
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    0(%r8),%eax
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       addl    12(%r8),%edx
+       movl    %eax,0(%r8)
+       addl    16(%r8),%ebp
+       movl    %esi,4(%r8)
+       movl    %esi,%ebx
+       movl    %ecx,8(%r8)
+       movl    %ecx,%edi
+       movl    %edx,12(%r8)
+       xorl    %edx,%edi
+       movl    %ebp,16(%r8)
+       andl    %edi,%esi
+       jmp     .Loop_avx
+
+.p2align       4
+.Ldone_avx:    # target of the je above (taken when %r9 == %r10); nothing below reads from (%r9)
+       addl    16(%rsp),%ebx    # step input: add the 32-bit word stored at 16(%rsp)
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx    # same source and destination: rotate %ecx left by 5
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx    # same source and destination: rotate %edx right by 7
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax    # next step: word at 20(%rsp)
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx    # %ecx was rotated left by 5 above; right by 7 here nets a rotate left by 30
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp    # next step: word at 24(%rsp)
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx    # next step: word at 28(%rsp)
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx    # next step: word at 32(%rsp)
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx    # next step: word at 36(%rsp)
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax    # next step: word at 40(%rsp)
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp    # next step: word at 44(%rsp)
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx    # next step: word at 48(%rsp)
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx    # next step: word at 52(%rsp)
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx    # next step: word at 56(%rsp)
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax    # last step on this path: word at 60(%rsp)
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vzeroupper    # zero the upper ymm halves before the legacy-SSE movaps restores below
+
+       addl    0(%r8),%eax    # fold the five 32-bit words at 0..16(%r8) into the working registers ...
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       movl    %eax,0(%r8)    # ... and write the sums back to the same five slots
+       addl    12(%r8),%edx
+       movl    %esi,4(%r8)
+       addl    16(%r8),%ebp
+       movl    %ecx,8(%r8)
+       movl    %edx,12(%r8)
+       movl    %ebp,16(%r8)
+       movaps  -40-96(%r11),%xmm6    # reload %xmm6..%xmm11 (callee-saved under the Microsoft x64 ABI) from slots addressed off %r11
+       movaps  -40-80(%r11),%xmm7
+       movaps  -40-64(%r11),%xmm8
+       movaps  -40-48(%r11),%xmm9
+       movaps  -40-32(%r11),%xmm10
+       movaps  -40-16(%r11),%xmm11
+       movq    -40(%r11),%r14    # reload %r14, %r13, %r12, %rbp, %rbx from the five 8-byte slots just below %r11
+
+       movq    -32(%r11),%r13
+
+       movq    -24(%r11),%r12
+
+       movq    -16(%r11),%rbp
+
+       movq    -8(%r11),%rbx
+
+       leaq    (%r11),%rsp    # %rsp = %r11, discarding the whole frame in one step
+
+.Lepilogue_avx:
+       movq    8(%rsp),%rdi    # reload %rdi/%rsi from 8(%rsp)/16(%rsp); presumably stored there at function entry (entry not visible here) - confirm
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3    # hand-encoded "rep ret" (0xf3 = REP prefix, 0xc3 = near RET)
+
+.LSEH_end_sha1_block_data_order_avx:    # label at the end of the function body; by its name presumably referenced by SEH unwind data elsewhere - confirm
+.def   sha1_block_data_order_avx2;     .scl 3; .type 32;       .endef    # COFF symbol record: storage class 3 (static), type 32 (function)
+.p2align       4
+sha1_block_data_order_avx2:    # AVX2/BMI code path for SHA-1 (per its name); the entry below accepts Microsoft x64 arguments in %rcx, %rdx, %r8
+       movq    %rdi,8(%rsp)    # stash %rdi and %rsi (callee-saved on Win64) in the caller-provided slots above the return address
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax    # %rax = %rsp as it was on entry
+.LSEH_begin_sha1_block_data_order_avx2:
+       movq    %rcx,%rdi    # shuffle the three Win64 argument registers into %rdi, %rsi, %rdx
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+_avx2_shortcut:    # secondary label after the argument shuffle; its users are not visible in this chunk
+
+       movq    %rsp,%r11    # %r11 = %rsp before the pushes; base for the save-slot offsets below
+
+       pushq   %rbx    # five pushes = 40 bytes, hence the "-40" in the xmm slot offsets
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       vzeroupper
+       leaq    -96(%rsp),%rsp    # reserve 6 x 16 bytes for %xmm6..%xmm11
+       vmovaps %xmm6,-40-96(%r11)    # aligned stores: the slots are 16-byte aligned if %rsp was 8 mod 16 at entry
+       vmovaps %xmm7,-40-80(%r11)
+       vmovaps %xmm8,-40-64(%r11)
+       vmovaps %xmm9,-40-48(%r11)
+       vmovaps %xmm10,-40-32(%r11)
+       vmovaps %xmm11,-40-16(%r11)
+.Lprologue_avx2:
+       movq    %rdi,%r8    # %r8 = first argument; five 32-bit words are read from 0..16(%r8) below
+       movq    %rsi,%r9    # %r9 = second argument; 64-byte blocks are read from it below
+       movq    %rdx,%r10    # %r10 = third argument; scaled by 64 and added to %r9 below
+
+       leaq    -640(%rsp),%rsp    # reserve 640 bytes of scratch ...
+       shlq    $6,%r10    # %r10 = count * 64
+       leaq    64(%r9),%r13    # %r13 = address of the block after the current one
+       andq    $-128,%rsp    # ... and round %rsp down to a 128-byte boundary
+       addq    %r9,%r10    # %r10 = %r9 + count*64, i.e. one past the last 64-byte block
+       leaq    K_XX_XX+64(%rip),%r14    # %r14 = K_XX_XX + 64 (table defined elsewhere in the file)
+
+       movl    0(%r8),%eax    # load the five 32-bit words at 0..16(%r8) into %eax, %ebp, %ecx, %edx, %esi
+       cmpq    %r10,%r13
+       cmovaeq %r9,%r13    # if %r9+64 is at or beyond %r10 (unsigned), read the second lane from %r9 itself instead
+       movl    4(%r8),%ebp
+       movl    8(%r8),%ecx
+       movl    12(%r8),%edx
+       movl    16(%r8),%esi
+       vmovdqu 64(%r14),%ymm6    # 32 bytes from K_XX_XX+128, used as the vpshufb control below (presumably a byte-swap mask - confirm against the table)
+
+       vmovdqu (%r9),%xmm0    # low 128-bit lanes: the 64 bytes at %r9
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       leaq    64(%r9),%r9    # advance %r9 by 64
+       vinserti128     $1,(%r13),%ymm0,%ymm0    # high 128-bit lanes: the 64 bytes at %r13
+       vinserti128     $1,16(%r13),%ymm1,%ymm1
+       vpshufb %ymm6,%ymm0,%ymm0    # permute bytes within each lane using the mask in %ymm6
+       vinserti128     $1,32(%r13),%ymm2,%ymm2
+       vpshufb %ymm6,%ymm1,%ymm1
+       vinserti128     $1,48(%r13),%ymm3,%ymm3
+       vpshufb %ymm6,%ymm2,%ymm2
+       vmovdqu -64(%r14),%ymm11    # 32 bytes from K_XX_XX+0; added to every vector stored below (presumably the round constant - confirm)
+       vpshufb %ymm6,%ymm3,%ymm3    # last use of %ymm6 as shuffle mask; it is reused for data below
+
+       vpaddd  %ymm11,%ymm0,%ymm4    # pre-add %ymm11 to the four loaded vectors and store the sums at 0..127(%rsp)
+       vpaddd  %ymm11,%ymm1,%ymm5
+       vmovdqu %ymm4,0(%rsp)
+       vpaddd  %ymm11,%ymm2,%ymm6
+       vmovdqu %ymm5,32(%rsp)
+       vpaddd  %ymm11,%ymm3,%ymm7
+       vmovdqu %ymm6,64(%rsp)
+       vmovdqu %ymm7,96(%rsp)
+       vpalignr        $8,%ymm0,%ymm1,%ymm4    # per lane: bytes 8..23 of the pair %ymm1:%ymm0
+       vpsrldq $4,%ymm3,%ymm8    # per lane: %ymm3 shifted right by 4 bytes
+       vpxor   %ymm0,%ymm4,%ymm4
+       vpxor   %ymm2,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm4,%ymm4    # %ymm4 = xor of the four terms above
+       vpsrld  $31,%ymm4,%ymm8    # top bit of each dword, moved down to bit 0
+       vpslldq $12,%ymm4,%ymm10    # per lane: dword 0 moved up to dword 3, the rest zero
+       vpaddd  %ymm4,%ymm4,%ymm4    # each dword doubled (left shift by 1)
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm4,%ymm4    # together with the two lines above: rotate each dword left by 1
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm4,%ymm4    # xor in the lane-shifted copy rotated left by 2 (its two halves are in %ymm9 and %ymm10)
+       vpxor   %ymm10,%ymm4,%ymm4
+       vpaddd  %ymm11,%ymm4,%ymm9
+       vmovdqu %ymm9,128(%rsp)    # store the new vector plus %ymm11 at 128(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm5    # same sequence with the register roles rotated by one; result in %ymm5
+       vpsrldq $4,%ymm4,%ymm8
+       vpxor   %ymm1,%ymm5,%ymm5
+       vpxor   %ymm3,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm5,%ymm5
+       vpsrld  $31,%ymm5,%ymm8
+       vmovdqu -32(%r14),%ymm11    # switch %ymm11 to the next 32 bytes of the table (K_XX_XX+32)
+       vpslldq $12,%ymm5,%ymm10
+       vpaddd  %ymm5,%ymm5,%ymm5
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm5,%ymm5
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm5,%ymm5
+       vpxor   %ymm10,%ymm5,%ymm5
+       vpaddd  %ymm11,%ymm5,%ymm9
+       vmovdqu %ymm9,160(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm6    # same sequence again; result in %ymm6
+       vpsrldq $4,%ymm5,%ymm8
+       vpxor   %ymm2,%ymm6,%ymm6
+       vpxor   %ymm4,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm6,%ymm6
+       vpsrld  $31,%ymm6,%ymm8
+       vpslldq $12,%ymm6,%ymm10
+       vpaddd  %ymm6,%ymm6,%ymm6
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm6,%ymm6
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm6,%ymm6
+       vpxor   %ymm10,%ymm6,%ymm6
+       vpaddd  %ymm11,%ymm6,%ymm9
+       vmovdqu %ymm9,192(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm7    # same sequence again; result in %ymm7
+       vpsrldq $4,%ymm6,%ymm8
+       vpxor   %ymm3,%ymm7,%ymm7
+       vpxor   %ymm5,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm7,%ymm7
+       vpsrld  $31,%ymm7,%ymm8
+       vpslldq $12,%ymm7,%ymm10
+       vpaddd  %ymm7,%ymm7,%ymm7
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm7,%ymm7
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm7,%ymm7
+       vpxor   %ymm10,%ymm7,%ymm7
+       vpaddd  %ymm11,%ymm7,%ymm9
+       vmovdqu %ymm9,224(%rsp)    # 0..255(%rsp) now hold eight pre-added vectors
+       leaq    128(%rsp),%r13    # %r13 = %rsp+128; the loop reads its words via offsets from %r13 (starting at -128(%r13))
+       jmp     .Loop_avx2
+.p2align       5
+.Loop_avx2:
+       rorxl   $2,%ebp,%ebx
+       andnl   %edx,%ebp,%edi
+       andl    %ecx,%ebp
+       xorl    %edi,%ebp
+       jmp     .Lalign32_1
+.p2align       5
+.Lalign32_1:
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       vpxor   %ymm4,%ymm0,%ymm0
+       addl    -128(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       vpxor   %ymm1,%ymm0,%ymm0
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpxor   %ymm8,%ymm0,%ymm0
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       vpsrld  $30,%ymm0,%ymm8
+       vpslld  $2,%ymm0,%ymm0
+       addl    -124(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       vpor    %ymm8,%ymm0,%ymm0
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -120(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       vpaddd  %ymm11,%ymm0,%ymm9
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       vmovdqu %ymm9,256(%rsp)
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -116(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -96(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       vpxor   %ymm5,%ymm1,%ymm1
+       addl    -92(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       vpxor   %ymm2,%ymm1,%ymm1
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       vpxor   %ymm8,%ymm1,%ymm1
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       vpsrld  $30,%ymm1,%ymm8
+       vpslld  $2,%ymm1,%ymm1
+       addl    -88(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       vpor    %ymm8,%ymm1,%ymm1
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -84(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       vmovdqu %ymm9,288(%rsp)
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -64(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -60(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       vpxor   %ymm6,%ymm2,%ymm2
+       addl    -56(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       vpxor   %ymm3,%ymm2,%ymm2
+       vmovdqu 0(%r14),%ymm11
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpxor   %ymm8,%ymm2,%ymm2
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       vpsrld  $30,%ymm2,%ymm8
+       vpslld  $2,%ymm2,%ymm2
+       addl    -52(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       vpor    %ymm8,%ymm2,%ymm2
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -32(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       vpaddd  %ymm11,%ymm2,%ymm9
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       vmovdqu %ymm9,320(%rsp)
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -28(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -24(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       vpxor   %ymm7,%ymm3,%ymm3
+       addl    -20(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       vpxor   %ymm4,%ymm3,%ymm3
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpxor   %ymm8,%ymm3,%ymm3
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       vpsrld  $30,%ymm3,%ymm8
+       vpslld  $2,%ymm3,%ymm3
+       addl    0(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       vpor    %ymm8,%ymm3,%ymm3
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    4(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       vpaddd  %ymm11,%ymm3,%ymm9
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       vmovdqu %ymm9,352(%rsp)
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    8(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    12(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       vpxor   %ymm0,%ymm4,%ymm4
+       addl    32(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpxor   %ymm8,%ymm4,%ymm4
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    36(%r13),%ebx
+       vpsrld  $30,%ymm4,%ymm8
+       vpslld  $2,%ymm4,%ymm4
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vpor    %ymm8,%ymm4,%ymm4
+       addl    40(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpaddd  %ymm11,%ymm4,%ymm9
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    44(%r13),%eax
+       vmovdqu %ymm9,384(%rsp)
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       vpxor   %ymm1,%ymm5,%ymm5
+       addl    68(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm6,%ymm5,%ymm5
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       vpxor   %ymm8,%ymm5,%ymm5
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    72(%r13),%ecx
+       vpsrld  $30,%ymm5,%ymm8
+       vpslld  $2,%ymm5,%ymm5
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       vpor    %ymm8,%ymm5,%ymm5
+       addl    76(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpaddd  %ymm11,%ymm5,%ymm9
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    96(%r13),%ebp
+       vmovdqu %ymm9,416(%rsp)
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    100(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       vpxor   %ymm2,%ymm6,%ymm6
+       addl    104(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       vpxor   %ymm8,%ymm6,%ymm6
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    108(%r13),%edx
+       leaq    256(%r13),%r13
+       vpsrld  $30,%ymm6,%ymm8
+       vpslld  $2,%ymm6,%ymm6
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vpor    %ymm8,%ymm6,%ymm6
+       addl    -128(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpaddd  %ymm11,%ymm6,%ymm9
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -124(%r13),%ebx
+       vmovdqu %ymm9,448(%rsp)
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -120(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       vpxor   %ymm3,%ymm7,%ymm7
+       addl    -116(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       vpxor   %ymm0,%ymm7,%ymm7
+       vmovdqu 32(%r14),%ymm11
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       vpxor   %ymm8,%ymm7,%ymm7
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -96(%r13),%esi
+       vpsrld  $30,%ymm7,%ymm8
+       vpslld  $2,%ymm7,%ymm7
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpor    %ymm8,%ymm7,%ymm7
+       addl    -92(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpaddd  %ymm11,%ymm7,%ymm9
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -88(%r13),%ecx
+       vmovdqu %ymm9,480(%rsp)
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -84(%r13),%ebx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       jmp     .Lalign32_2
+.p2align       5
+.Lalign32_2:
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       vpxor   %ymm4,%ymm0,%ymm0
+       addl    -64(%r13),%ebp
+       xorl    %esi,%ecx
+       vpxor   %ymm1,%ymm0,%ymm0
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       vpxor   %ymm8,%ymm0,%ymm0
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       vpsrld  $30,%ymm0,%ymm8
+       vpslld  $2,%ymm0,%ymm0
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -60(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       vpor    %ymm8,%ymm0,%ymm0
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       vpaddd  %ymm11,%ymm0,%ymm9
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    -56(%r13),%esi
+       xorl    %ecx,%ebp
+       vmovdqu %ymm9,512(%rsp)
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    -52(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    -32(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       vpxor   %ymm5,%ymm1,%ymm1
+       addl    -28(%r13),%ebx
+       xorl    %eax,%edx
+       vpxor   %ymm2,%ymm1,%ymm1
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       vpxor   %ymm8,%ymm1,%ymm1
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vpsrld  $30,%ymm1,%ymm8
+       vpslld  $2,%ymm1,%ymm1
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    -24(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       vpor    %ymm8,%ymm1,%ymm1
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -20(%r13),%eax
+       xorl    %edx,%ebx
+       vmovdqu %ymm9,544(%rsp)
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    0(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    4(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       vpxor   %ymm6,%ymm2,%ymm2
+       addl    8(%r13),%ecx
+       xorl    %ebp,%esi
+       vpxor   %ymm3,%ymm2,%ymm2
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       vpxor   %ymm8,%ymm2,%ymm2
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpsrld  $30,%ymm2,%ymm8
+       vpslld  $2,%ymm2,%ymm2
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    12(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       vpor    %ymm8,%ymm2,%ymm2
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vpaddd  %ymm11,%ymm2,%ymm9
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    32(%r13),%ebp
+       xorl    %esi,%ecx
+       vmovdqu %ymm9,576(%rsp)
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    36(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    40(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       vpxor   %ymm7,%ymm3,%ymm3
+       addl    44(%r13),%edx
+       xorl    %ebx,%eax
+       vpxor   %ymm4,%ymm3,%ymm3
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm8,%ymm3,%ymm3
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       vpsrld  $30,%ymm3,%ymm8
+       vpslld  $2,%ymm3,%ymm3
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    64(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       vpor    %ymm8,%ymm3,%ymm3
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpaddd  %ymm11,%ymm3,%ymm9
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    68(%r13),%ebx
+       xorl    %eax,%edx
+       vmovdqu %ymm9,608(%rsp)
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    72(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    76(%r13),%eax
+       xorl    %edx,%ebx
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    100(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    104(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    108(%r13),%ebx
+       leaq    256(%r13),%r13
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -128(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -124(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -120(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -116(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -96(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -92(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -88(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -84(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -60(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -56(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -52(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -32(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -28(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -24(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -20(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       addl    %r12d,%edx
+       leaq    128(%r9),%r13
+       leaq    128(%r9),%rdi
+       cmpq    %r10,%r13
+       cmovaeq %r9,%r13
+
+
+       addl    0(%r8),%edx
+       addl    4(%r8),%esi
+       addl    8(%r8),%ebp
+       movl    %edx,0(%r8)
+       addl    12(%r8),%ebx
+       movl    %esi,4(%r8)
+       movl    %edx,%eax
+       addl    16(%r8),%ecx
+       movl    %ebp,%r12d
+       movl    %ebp,8(%r8)
+       movl    %ebx,%edx
+
+       movl    %ebx,12(%r8)
+       movl    %esi,%ebp
+       movl    %ecx,16(%r8)
+
+       movl    %ecx,%esi
+       movl    %r12d,%ecx
+
+
+       cmpq    %r10,%r9
+       je      .Ldone_avx2
+       vmovdqu 64(%r14),%ymm6
+       cmpq    %r10,%rdi
+       ja      .Last_avx2
+
+       vmovdqu -64(%rdi),%xmm0
+       vmovdqu -48(%rdi),%xmm1
+       vmovdqu -32(%rdi),%xmm2
+       vmovdqu -16(%rdi),%xmm3
+       vinserti128     $1,0(%r13),%ymm0,%ymm0
+       vinserti128     $1,16(%r13),%ymm1,%ymm1
+       vinserti128     $1,32(%r13),%ymm2,%ymm2
+       vinserti128     $1,48(%r13),%ymm3,%ymm3
+       jmp     .Last_avx2
+
+.p2align       5
+.Last_avx2:
+       leaq    128+16(%rsp),%r13
+       rorxl   $2,%ebp,%ebx
+       andnl   %edx,%ebp,%edi
+       andl    %ecx,%ebp
+       xorl    %edi,%ebp
+       subq    $-128,%r9
+       addl    -128(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -124(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -120(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -116(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -96(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    -92(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -88(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -84(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -64(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -60(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -56(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    -52(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -32(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -28(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -24(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -20(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    0(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    4(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    8(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    12(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    32(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    36(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    40(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    44(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vmovdqu -64(%r14),%ymm11
+       vpshufb %ymm6,%ymm0,%ymm0
+       addl    68(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    72(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    76(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    96(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    100(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpshufb %ymm6,%ymm1,%ymm1
+       vpaddd  %ymm11,%ymm0,%ymm8
+       addl    104(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    108(%r13),%edx
+       leaq    256(%r13),%r13
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -128(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -124(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -120(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vmovdqu %ymm8,0(%rsp)
+       vpshufb %ymm6,%ymm2,%ymm2
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    -116(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -92(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -88(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -84(%r13),%ebx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       vmovdqu %ymm9,32(%rsp)
+       vpshufb %ymm6,%ymm3,%ymm3
+       vpaddd  %ymm11,%ymm2,%ymm6
+       addl    -64(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -60(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    -56(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
        xorl    %ecx,%edi
-       movl    %eax,%esi
-       roll    $5,%eax
-       addl    %edi,%ebp
-       xorl    %ecx,%esi
-       rorl    $7,%ebx
-       addl    %eax,%ebp
-       addl    48(%rsp),%edx
-       xorl    %ebx,%esi
-.byte  102,15,56,0,222
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    -52(%r13),%edx
+       xorl    %ebx,%eax
        movl    %ebp,%edi
-       roll    $5,%ebp
-       paddd   %xmm9,%xmm2
-       addl    %esi,%edx
        xorl    %ebx,%edi
-       rorl    $7,%eax
-       addl    %ebp,%edx
-       movdqa  %xmm2,32(%rsp)
-       addl    52(%rsp),%ecx
-       xorl    %eax,%edi
-       psubd   %xmm9,%xmm2
-       movl    %edx,%esi
-       roll    $5,%edx
-       addl    %edi,%ecx
-       xorl    %eax,%esi
-       rorl    $7,%ebp
-       addl    %edx,%ecx
-       addl    56(%rsp),%ebx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
        xorl    %ebp,%esi
-       movl    %ecx,%edi
-       roll    $5,%ecx
-       addl    %esi,%ebx
-       xorl    %ebp,%edi
-       rorl    $7,%edx
-       addl    %ecx,%ebx
-       addl    60(%rsp),%eax
-       xorl    %edx,%edi
-       movl    %ebx,%esi
-       roll    $5,%ebx
-       addl    %edi,%eax
-       rorl    $7,%ecx
-       addl    %ebx,%eax
-       addl    0(%r8),%eax
-       addl    4(%r8),%esi
-       addl    8(%r8),%ecx
-       addl    12(%r8),%edx
-       movl    %eax,0(%r8)
-       addl    16(%r8),%ebp
-       movl    %esi,4(%r8)
-       movl    %esi,%ebx
-       movl    %ecx,8(%r8)
-       movl    %ecx,%edi
-       movl    %edx,12(%r8)
-       xorl    %edx,%edi
-       movl    %ebp,16(%r8)
+       addl    %r12d,%edx
        andl    %edi,%esi
-       jmp     .Loop_ssse3
-
-.p2align       4
-.Ldone_ssse3:
-       addl    16(%rsp),%ebx
+       addl    -32(%r13),%ecx
        xorl    %ebp,%esi
-       movl    %ecx,%edi
-       roll    $5,%ecx
-       addl    %esi,%ebx
+       movl    %eax,%edi
        xorl    %ebp,%edi
-       rorl    $7,%edx
-       addl    %ecx,%ebx
-       addl    20(%rsp),%eax
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       jmp     .Lalign32_3
+.p2align       5
+.Lalign32_3:
+       vmovdqu %ymm6,64(%rsp)
+       vpaddd  %ymm11,%ymm3,%ymm7
+       addl    -28(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    -24(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -20(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
        xorl    %edx,%edi
-       movl    %ebx,%esi
-       roll    $5,%ebx
-       addl    %edi,%eax
-       xorl    %edx,%esi
-       rorl    $7,%ecx
-       addl    %ebx,%eax
-       addl    24(%rsp),%ebp
-       xorl    %ecx,%esi
-       movl    %eax,%edi
-       roll    $5,%eax
-       addl    %esi,%ebp
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    0(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
        xorl    %ecx,%edi
-       rorl    $7,%ebx
-       addl    %eax,%ebp
-       addl    28(%rsp),%edx
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    4(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
        xorl    %ebx,%edi
-       movl    %ebp,%esi
-       roll    $5,%ebp
-       addl    %edi,%edx
-       xorl    %ebx,%esi
-       rorl    $7,%eax
-       addl    %ebp,%edx
-       addl    32(%rsp),%ecx
-       xorl    %eax,%esi
-       movl    %edx,%edi
-       roll    $5,%edx
-       addl    %esi,%ecx
-       xorl    %eax,%edi
-       rorl    $7,%ebp
-       addl    %edx,%ecx
-       addl    36(%rsp),%ebx
-       xorl    %ebp,%edi
-       movl    %ecx,%esi
-       roll    $5,%ecx
-       addl    %edi,%ebx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
        xorl    %ebp,%esi
-       rorl    $7,%edx
-       addl    %ecx,%ebx
-       addl    40(%rsp),%eax
-       xorl    %edx,%esi
-       movl    %ebx,%edi
-       roll    $5,%ebx
-       addl    %esi,%eax
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       vmovdqu %ymm7,96(%rsp)
+       addl    8(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    12(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    32(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    36(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
        xorl    %edx,%edi
-       rorl    $7,%ecx
-       addl    %ebx,%eax
-       addl    44(%rsp),%ebp
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    40(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
        xorl    %ecx,%edi
-       movl    %eax,%esi
-       roll    $5,%eax
-       addl    %edi,%ebp
-       xorl    %ecx,%esi
-       rorl    $7,%ebx
-       addl    %eax,%ebp
-       addl    48(%rsp),%edx
-       xorl    %ebx,%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       vpalignr        $8,%ymm0,%ymm1,%ymm4
+       addl    44(%r13),%edx
+       xorl    %ebx,%eax
        movl    %ebp,%edi
-       roll    $5,%ebp
-       addl    %esi,%edx
        xorl    %ebx,%edi
-       rorl    $7,%eax
-       addl    %ebp,%edx
-       addl    52(%rsp),%ecx
-       xorl    %eax,%edi
-       movl    %edx,%esi
-       roll    $5,%edx
-       addl    %edi,%ecx
-       xorl    %eax,%esi
-       rorl    $7,%ebp
-       addl    %edx,%ecx
-       addl    56(%rsp),%ebx
+       vpsrldq $4,%ymm3,%ymm8
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpxor   %ymm0,%ymm4,%ymm4
+       vpxor   %ymm2,%ymm8,%ymm8
        xorl    %ebp,%esi
-       movl    %ecx,%edi
-       roll    $5,%ecx
-       addl    %esi,%ebx
+       addl    %r12d,%edx
+       vpxor   %ymm8,%ymm4,%ymm4
+       andl    %edi,%esi
+       addl    64(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       vpsrld  $31,%ymm4,%ymm8
        xorl    %ebp,%edi
-       rorl    $7,%edx
-       addl    %ecx,%ebx
-       addl    60(%rsp),%eax
-       xorl    %edx,%edi
-       movl    %ebx,%esi
-       roll    $5,%ebx
-       addl    %edi,%eax
-       rorl    $7,%ecx
-       addl    %ebx,%eax
-       addl    0(%r8),%eax
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       vpslldq $12,%ymm4,%ymm10
+       vpaddd  %ymm4,%ymm4,%ymm4
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm4,%ymm4
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm4,%ymm4
+       addl    68(%r13),%ebx
+       xorl    %eax,%edx
+       vpxor   %ymm10,%ymm4,%ymm4
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       vpaddd  %ymm11,%ymm4,%ymm9
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vmovdqu %ymm9,128(%rsp)
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    72(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    76(%r13),%eax
+       xorl    %edx,%ebx
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpalignr        $8,%ymm1,%ymm2,%ymm5
+       addl    96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpsrldq $4,%ymm4,%ymm8
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpxor   %ymm1,%ymm5,%ymm5
+       vpxor   %ymm3,%ymm8,%ymm8
+       addl    100(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm8,%ymm5,%ymm5
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpsrld  $31,%ymm5,%ymm8
+       vmovdqu -32(%r14),%ymm11
+       xorl    %ebx,%esi
+       addl    104(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       vpslldq $12,%ymm5,%ymm10
+       vpaddd  %ymm5,%ymm5,%ymm5
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm5,%ymm5
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm5,%ymm5
+       xorl    %ebp,%edx
+       addl    108(%r13),%ebx
+       leaq    256(%r13),%r13
+       vpxor   %ymm10,%ymm5,%ymm5
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpaddd  %ymm11,%ymm5,%ymm9
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vmovdqu %ymm9,160(%rsp)
+       addl    -128(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpalignr        $8,%ymm2,%ymm3,%ymm6
+       addl    -124(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       vpsrldq $4,%ymm5,%ymm8
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpxor   %ymm2,%ymm6,%ymm6
+       vpxor   %ymm4,%ymm8,%ymm8
+       addl    -120(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpxor   %ymm8,%ymm6,%ymm6
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       vpsrld  $31,%ymm6,%ymm8
+       xorl    %ecx,%eax
+       addl    -116(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpslldq $12,%ymm6,%ymm10
+       vpaddd  %ymm6,%ymm6,%ymm6
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm6,%ymm6
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm6,%ymm6
+       xorl    %ebx,%esi
+       addl    -96(%r13),%ecx
+       vpxor   %ymm10,%ymm6,%ymm6
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpaddd  %ymm11,%ymm6,%ymm9
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       vmovdqu %ymm9,192(%rsp)
+       addl    -92(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vpalignr        $8,%ymm3,%ymm4,%ymm7
+       addl    -88(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpsrldq $4,%ymm6,%ymm8
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpxor   %ymm3,%ymm7,%ymm7
+       vpxor   %ymm5,%ymm8,%ymm8
+       addl    -84(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       vpxor   %ymm8,%ymm7,%ymm7
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       vpsrld  $31,%ymm7,%ymm8
+       xorl    %edx,%ebp
+       addl    -64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpslldq $12,%ymm7,%ymm10
+       vpaddd  %ymm7,%ymm7,%ymm7
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm7,%ymm7
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm7,%ymm7
+       xorl    %ecx,%eax
+       addl    -60(%r13),%edx
+       vpxor   %ymm10,%ymm7,%ymm7
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpaddd  %ymm11,%ymm7,%ymm9
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vmovdqu %ymm9,224(%rsp)
+       addl    -56(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -52(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -32(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -28(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -24(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -20(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       addl    %r12d,%edx
+       leaq    128(%rsp),%r13
+
+
+       addl    0(%r8),%edx
        addl    4(%r8),%esi
-       addl    8(%r8),%ecx
-       movl    %eax,0(%r8)
-       addl    12(%r8),%edx
+       addl    8(%r8),%ebp
+       movl    %edx,0(%r8)
+       addl    12(%r8),%ebx
        movl    %esi,4(%r8)
-       addl    16(%r8),%ebp
-       movl    %ecx,8(%r8)
-       movl    %edx,12(%r8)
-       movl    %ebp,16(%r8)
-       movaps  64+0(%rsp),%xmm6
-       movaps  64+16(%rsp),%xmm7
-       movaps  64+32(%rsp),%xmm8
-       movaps  64+48(%rsp),%xmm9
-       movaps  64+64(%rsp),%xmm10
-       movaps  64+80(%rsp),%xmm11
-       leaq    160(%rsp),%rsi
-       movq    0(%rsi),%r12
-       movq    8(%rsi),%rbp
-       movq    16(%rsi),%rbx
-       leaq    24(%rsi),%rsp
-.Lepilogue_ssse3:
+       movl    %edx,%eax
+       addl    16(%r8),%ecx
+       movl    %ebp,%r12d
+       movl    %ebp,8(%r8)
+       movl    %ebx,%edx
+
+       movl    %ebx,12(%r8)
+       movl    %esi,%ebp
+       movl    %ecx,16(%r8)
+
+       movl    %ecx,%esi
+       movl    %r12d,%ecx
+
+
+       cmpq    %r10,%r9
+       jbe     .Loop_avx2
+
+.Ldone_avx2:
+       vzeroupper
+       movaps  -40-96(%r11),%xmm6
+       movaps  -40-80(%r11),%xmm7
+       movaps  -40-64(%r11),%xmm8
+       movaps  -40-48(%r11),%xmm9
+       movaps  -40-32(%r11),%xmm10
+       movaps  -40-16(%r11),%xmm11
+       movq    -40(%r11),%r14
+
+       movq    -32(%r11),%r13
+
+       movq    -24(%r11),%r12
+
+       movq    -16(%r11),%rbp
+
+       movq    -8(%r11),%rbx
+
+       leaq    (%r11),%rsp
+
+.Lepilogue_avx2:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
-.LSEH_end_sha1_block_data_order_ssse3:
+
+.LSEH_end_sha1_block_data_order_avx2:
 .p2align       6
 K_XX_XX:
-.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     
-.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     
-.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     
-.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999  # SHA-1 round constant K for rounds 0-19 (FIPS 180-4), one copy per 32-bit lane
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999  # row repeated: the pair fills 32 bytes, the width of one ymm load
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1  # K for rounds 20-39
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1  # (repeat for the upper 16 bytes)
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc  # K for rounds 40-59
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc  # (repeat for the upper 16 bytes)
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6  # K for rounds 60-79
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6  # (repeat for the upper 16 bytes)
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f  # in memory (little-endian) this is bytes 03 02 01 00 07 06 05 04 ...; as a vpshufb mask it reverses the bytes of each 32-bit word. Presumably the mask read via 64(%r14) - confirm r14 = K_XX_XX+64
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f  # (repeat for the upper 16 bytes)
+.byte  0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0  # byte indices 15..0 in descending order; consumer is not visible in this chunk - presumably a full 16-byte reversal shuffle mask, TODO confirm
 .byte  83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align       6
 
@@ -2567,19 +5618,51 @@ se_handler:
        jae     .Lcommon_seh_tail
 
        movq    64(%rax),%rax
-       leaq    32(%rax),%rax
 
        movq    -8(%rax),%rbx
        movq    -16(%rax),%rbp
        movq    -24(%rax),%r12
        movq    -32(%rax),%r13
+       movq    -40(%rax),%r14
        movq    %rbx,144(%r8)
        movq    %rbp,160(%r8)
        movq    %r12,216(%r8)
        movq    %r13,224(%r8)
+       movq    %r14,232(%r8)
 
        jmp     .Lcommon_seh_tail
 
+.def   shaext_handler; .scl 3; .type 32;       .endef	# COFF symbol record: storage class 3 (static), type 32 (function)
+.p2align       4
+shaext_handler:	# exception handler referenced from the _shaext unwind record; r8 is assumed to be the CONTEXT* argument of a Win64 language-specific handler
+       pushq   %rsi	# rsi/rdi are overwritten by the string copy below
+       pushq   %rdi
+       pushq   %rbx	# rbx is used as scratch for the faulting address
+       pushq   %rbp	# remaining pushes are not undone in this block; presumably the shared tail restores them
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushfq	# the copy below executes cld, so flags are saved too
+       subq    $64,%rsp	# reserve 64 bytes of stack
+
+       movq    120(%r8),%rax	# rax = qword at context+120 (CONTEXT.Rax in the AMD64 layout)
+       movq    248(%r8),%rbx	# rbx = qword at context+248 (CONTEXT.Rip in the AMD64 layout)
+
+       leaq    .Lprologue_shaext(%rip),%r10
+       cmpq    %r10,%rbx	# unsigned compare of the recorded address against the prologue label
+       jb      .Lcommon_seh_tail	# before .Lprologue_shaext: skip the XMM copy
+
+       leaq    .Lepilogue_shaext(%rip),%r10
+       cmpq    %r10,%rbx
+       jae     .Lcommon_seh_tail	# at or past .Lepilogue_shaext: skip the XMM copy
+
+       leaq    -8-64(%rax),%rsi	# copy source = rax-72; presumably where the function body spilled xmm6-xmm9 -- confirm against its prologue
+       leaq    512(%r8),%rdi	# copy destination = context+512 (CONTEXT.Xmm6 in the AMD64 layout)
+       movl    $8,%ecx	# 8 quadwords = 64 bytes, i.e. four 16-byte registers
+.long  0xa548f3fc	# raw bytes fc f3 48 a5 = cld; rep movsq
+
+       jmp     .Lcommon_seh_tail	# finish in the tail shared with the other handlers
 
 .def   ssse3_handler;  .scl 3; .type 32;       .endef
 .p2align       4
@@ -2606,25 +5689,28 @@ ssse3_handler:
        cmpq    %r10,%rbx
        jb      .Lcommon_seh_tail
 
-       movq    152(%r8),%rax
+       movq    208(%r8),%rax
 
        movl    4(%r11),%r10d
        leaq    (%rsi,%r10,1),%r10
        cmpq    %r10,%rbx
        jae     .Lcommon_seh_tail
 
-       leaq    64(%rax),%rsi
+       leaq    -40-96(%rax),%rsi
        leaq    512(%r8),%rdi
        movl    $12,%ecx
-.long  0xa548f3fc              
-       leaq    184(%rax),%rax
+.long  0xa548f3fc
 
        movq    -8(%rax),%rbx
        movq    -16(%rax),%rbp
        movq    -24(%rax),%r12
+       movq    -32(%rax),%r13
+       movq    -40(%rax),%r14
        movq    %rbx,144(%r8)
        movq    %rbp,160(%r8)
        movq    %r12,216(%r8)
+       movq    %r13,224(%r8)
+       movq    %r14,232(%r8)
 
 .Lcommon_seh_tail:
        movq    8(%rax),%rdi
@@ -2636,7 +5722,7 @@ ssse3_handler:
        movq    40(%r9),%rdi
        movq    %r8,%rsi
        movl    $154,%ecx
-.long  0xa548f3fc              
+.long  0xa548f3fc
 
        movq    %r9,%rsi
        xorq    %rcx,%rcx
@@ -2671,16 +5757,36 @@ ssse3_handler:
 .rva   .LSEH_begin_sha1_block_data_order
 .rva   .LSEH_end_sha1_block_data_order
 .rva   .LSEH_info_sha1_block_data_order
+.rva   .LSEH_begin_sha1_block_data_order_shaext
+.rva   .LSEH_end_sha1_block_data_order_shaext
+.rva   .LSEH_info_sha1_block_data_order_shaext
 .rva   .LSEH_begin_sha1_block_data_order_ssse3
 .rva   .LSEH_end_sha1_block_data_order_ssse3
 .rva   .LSEH_info_sha1_block_data_order_ssse3
+.rva   .LSEH_begin_sha1_block_data_order_avx
+.rva   .LSEH_end_sha1_block_data_order_avx
+.rva   .LSEH_info_sha1_block_data_order_avx
+.rva   .LSEH_begin_sha1_block_data_order_avx2
+.rva   .LSEH_end_sha1_block_data_order_avx2
+.rva   .LSEH_info_sha1_block_data_order_avx2
 .section       .xdata
 .p2align       3
 .LSEH_info_sha1_block_data_order:
 .byte  9,0,0,0
 .rva   se_handler
+.LSEH_info_sha1_block_data_order_shaext:	# unwind record for the _shaext code range listed in .pdata
+.byte  9,0,0,0	# UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER (0x09), prolog size 0, 0 unwind codes, no frame register
+.rva   shaext_handler	# image-relative address of the handler; no handler data follows
 .LSEH_info_sha1_block_data_order_ssse3:
 .byte  9,0,0,0
 .rva   ssse3_handler
-.rva   .Lprologue_ssse3,.Lepilogue_ssse3       
+.rva   .Lprologue_ssse3,.Lepilogue_ssse3
+.LSEH_info_sha1_block_data_order_avx:	# unwind record for the _avx code range listed in .pdata
+.byte  9,0,0,0	# UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER (0x09), prolog size 0, 0 unwind codes, no frame register
+.rva   ssse3_handler	# same handler as the _ssse3 record
+.rva   .Lprologue_avx,.Lepilogue_avx	# handler data: two RVAs, presumably the range bounds ssse3_handler compares the faulting address against
+.LSEH_info_sha1_block_data_order_avx2:	# unwind record for the _avx2 code range listed in .pdata
+.byte  9,0,0,0	# same header as above
+.rva   ssse3_handler	# the _avx2 path also reuses ssse3_handler
+.rva   .Lprologue_avx2,.Lepilogue_avx2	# handler data: prologue/epilogue label RVAs for the _avx2 body
 
index eaa435408ed47ca87e30bb246c2e792815372552..05cd61d1b14e418410c04a1978e6de592697d0d7 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha512-586.s"
 .text
 .globl _sha256_block_data_order
 .def   _sha256_block_data_order;       .scl    2;      .type   32;     .endef
@@ -64,20 +63,6 @@ _sha256_block_data_order:
        movl    %edi,4(%esp)
        movl    %eax,8(%esp)
        movl    %ebx,12(%esp)
-       leal    __gnutls_x86_cpuid_s-.L001K256(%ebp),%edx
-       movl    (%edx),%ecx
-       movl    4(%edx),%edx
-       testl   $1048576,%ecx
-       jnz     .L002loop
-       testl   $2048,%edx
-       andl    $1073741824,%ecx
-       andl    $268435456,%edx
-       orl     %edx,%ecx
-       cmpl    $1342177280,%ecx
-       je      .L003loop_shrd
-       subl    %edi,%eax
-       cmpl    $256,%eax
-       jae     .L004unrolled
        jmp     .L002loop
 .align 16
 .L002loop:
@@ -149,7 +134,7 @@ _sha256_block_data_order:
        movl    %ecx,28(%esp)
        movl    %edi,32(%esp)
 .align 16
-.L00500_15:
+.L00300_15:
        movl    %edx,%ecx
        movl    24(%esp),%esi
        rorl    $14,%ecx
@@ -187,11 +172,11 @@ _sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3248222580,%esi
-       jne     .L00500_15
+       jne     .L00300_15
        movl    156(%esp),%ecx
-       jmp     .L00616_63
+       jmp     .L00416_63
 .align 16
-.L00616_63:
+.L00416_63:
        movl    %ecx,%ebx
        movl    104(%esp),%esi
        rorl    $11,%ecx
@@ -246,7 +231,7 @@ _sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3329325298,%esi
-       jne     .L00616_63
+       jne     .L00416_63
        movl    356(%esp),%esi
        movl    8(%esp),%ebx
        movl    16(%esp),%ecx
@@ -280,8 +265,8 @@ _sha256_block_data_order:
        popl    %ebx
        popl    %ebp
        ret
-.align 16
-.L003loop_shrd:
+.align 32
+.L005loop_shrd:
        movl    (%edi),%eax
        movl    4(%edi),%ebx
        movl    8(%edi),%ecx
@@ -350,7 +335,7 @@ _sha256_block_data_order:
        movl    %ecx,28(%esp)
        movl    %edi,32(%esp)
 .align 16
-.L00700_15_shrd:
+.L00600_15_shrd:
        movl    %edx,%ecx
        movl    24(%esp),%esi
        shrdl   $14,%ecx,%ecx
@@ -388,11 +373,11 @@ _sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3248222580,%esi
-       jne     .L00700_15_shrd
+       jne     .L00600_15_shrd
        movl    156(%esp),%ecx
-       jmp     .L00816_63_shrd
+       jmp     .L00716_63_shrd
 .align 16
-.L00816_63_shrd:
+.L00716_63_shrd:
        movl    %ecx,%ebx
        movl    104(%esp),%esi
        shrdl   $11,%ecx,%ecx
@@ -447,7 +432,7 @@ _sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3329325298,%esi
-       jne     .L00816_63_shrd
+       jne     .L00716_63_shrd
        movl    356(%esp),%esi
        movl    8(%esp),%ebx
        movl    16(%esp),%ecx
@@ -474,7 +459,7 @@ _sha256_block_data_order:
        leal    356(%esp),%esp
        subl    $256,%ebp
        cmpl    8(%esp),%edi
-       jb      .L003loop_shrd
+       jb      .L005loop_shrd
        movl    12(%esp),%esp
        popl    %edi
        popl    %esi
@@ -485,8 +470,13 @@ _sha256_block_data_order:
 .L001K256:
 .long  1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
 .long  66051,67438087,134810123,202182159
+.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte  62,0
 .align 16
-.L004unrolled:
+.L008unrolled:
        leal    -96(%esp),%esp
        movl    (%esi),%eax
        movl    4(%esi),%ebp
@@ -3392,10 +3382,4 @@ _sha256_block_data_order:
        popl    %ebx
        popl    %ebp
        ret
-.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte  62,0
-.comm  __gnutls_x86_cpuid_s,16
 
diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha256-ssse3-x86_64.s
new file mode 100644 (file)
index 0000000..d2fc195
--- /dev/null
@@ -0,0 +1,5731 @@
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+#     * Redistributions of source code must retain copyright notices,
+#      this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above
+#      copyright notice, this list of conditions and the following
+#      disclaimer in the documentation and/or other materials
+#      provided with the distribution.
+#
+#     * Neither the name of the Andy Polyakov nor the names of its
+#      copyright holder and contributors may be used to endorse or
+#      promote products derived from this software without specific
+#      prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text  
+
+
+.globl sha256_block_data_order
+.def   sha256_block_data_order;        .scl 2; .type 32;       .endef
+.p2align       4
+sha256_block_data_order:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha256_block_data_order:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+
+       leaq    _gnutls_x86_cpuid_s(%rip),%r11
+       movl    0(%r11),%r9d
+       movl    4(%r11),%r10d
+       movl    8(%r11),%r11d
+       testl   $536870912,%r11d
+       jnz     _shaext_shortcut
+       andl    $296,%r11d
+       cmpl    $296,%r11d
+       je      .Lavx2_shortcut
+       andl    $1073741824,%r9d
+       andl    $268435968,%r10d
+       orl     %r9d,%r10d
+       cmpl    $1342177792,%r10d
+       je      .Lavx_shortcut
+       testl   $512,%r10d
+       jnz     .Lssse3_shortcut
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx
+       subq    $64+32,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+
+.Lprologue:
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       jmp     .Lloop
+
+.p2align       4
+.Lloop:
+       movl    %ebx,%edi
+       leaq    K256(%rip),%rbp
+       xorl    %ecx,%edi
+       movl    0(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    4(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    8(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    12(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    16(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    20(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    24(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    28(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%eax
+       movl    32(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    36(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    40(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    44(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    48(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    52(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    56(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    60(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       jmp     .Lrounds_16_xx
+.p2align       4
+.Lrounds_16_xx:
+       movl    4(%rsp),%r13d
+       movl    56(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    36(%rsp),%r12d
+
+       addl    0(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       movl    8(%rsp),%r13d
+       movl    60(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    40(%rsp),%r12d
+
+       addl    4(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       movl    12(%rsp),%r13d
+       movl    0(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    44(%rsp),%r12d
+
+       addl    8(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       movl    16(%rsp),%r13d
+       movl    4(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    48(%rsp),%r12d
+
+       addl    12(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       movl    20(%rsp),%r13d
+       movl    8(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    52(%rsp),%r12d
+
+       addl    16(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       movl    24(%rsp),%r13d
+       movl    12(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    56(%rsp),%r12d
+
+       addl    20(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       movl    28(%rsp),%r13d
+       movl    16(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    60(%rsp),%r12d
+
+       addl    24(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       movl    32(%rsp),%r13d
+       movl    20(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    0(%rsp),%r12d
+
+       addl    28(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       movl    36(%rsp),%r13d
+       movl    24(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    4(%rsp),%r12d
+
+       addl    32(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       movl    40(%rsp),%r13d
+       movl    28(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    8(%rsp),%r12d
+
+       addl    36(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       movl    44(%rsp),%r13d
+       movl    32(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    12(%rsp),%r12d
+
+       addl    40(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       movl    48(%rsp),%r13d
+       movl    36(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    16(%rsp),%r12d
+
+       addl    44(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       movl    52(%rsp),%r13d
+       movl    40(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    20(%rsp),%r12d
+
+       addl    48(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       movl    56(%rsp),%r13d
+       movl    44(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    24(%rsp),%r12d
+
+       addl    52(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       movl    60(%rsp),%r13d
+       movl    48(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    28(%rsp),%r12d
+
+       addl    56(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       movl    0(%rsp),%r13d
+       movl    52(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    32(%rsp),%r12d
+
+       addl    60(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jnz     .Lrounds_16_xx
+
+       movq    64+0(%rsp),%rdi
+       addl    %r14d,%eax
+       leaq    64(%rsi),%rsi
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop
+
+       movq    88(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+.Lepilogue:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_sha256_block_data_order:
+.p2align       6                # 64-byte alignment for the table below; it is read with movdqa (aligned loads)
+
+K256:                           # SHA-256 round constants: 16 rows of 4 words, each row emitted twice (32 bytes per 4 constants, 512 bytes total)
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5     # K[0..3]
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5     # duplicate of the row above (same for every pair below)
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5     # K[4..7]
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3     # K[8..11]
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174     # K[12..15]
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc     # K[16..19]
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da     # K[20..23]
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7     # K[24..27]
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967     # K[28..31]
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13     # K[32..35]
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85     # K[36..39]
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3     # K[40..43]
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070     # K[44..47]
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5     # K[48..51]
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3     # K[52..55]
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208     # K[56..59]
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2     # K[60..63]
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # K256+512: pshufb control that byte-reverses each 32-bit word; loaded into xmm7 by the shaext and ssse3 code. NOTE(review): the scalar loop's `cmpb $0,3(%rbp)` appears to rely on the zero top byte of the first word as an end-of-table marker - confirm
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # duplicate of the row above
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff     # K256+544: second shuffle mask; not referenced in the code visible around this table
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908     # K256+576: third shuffle mask; not referenced in the code visible around this table
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0       # NUL-terminated ASCII ident: "SHA256 block transform for x86_64, CRYPTOGAMS by" followed by the author's e-mail address in angle brackets
+.def   sha256_block_data_order_shaext; .scl 3; .type 32;       .endef  # SHA-256 block function built on the SHA extension instructions (emitted as raw .byte opcodes below). Win64-style entry: rcx = 32-byte state, rdx = input data, r8 = number of 64-byte blocks
+.p2align       6
+sha256_block_data_order_shaext:
+       movq    %rdi,8(%rsp)            # rdi/rsi are callee-saved on Win64: park them in the caller-provided slots above the return address
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax               # rax = entry rsp; used as frame anchor for the xmm save area and to restore rsp on exit
+.LSEH_begin_sha256_block_data_order_shaext:
+       movq    %rcx,%rdi               # rdi = state pointer
+       movq    %rdx,%rsi               # rsi = input pointer
+       movq    %r8,%rdx                # rdx = block counter
+
+_shaext_shortcut:                      # NOTE(review): presumably an alternate entry jumped to from another routine with rdi/rsi/rdx and rax (= entry rsp) already set up - confirm against the dispatcher
+       leaq    -88(%rsp),%rsp          # reserve 88 bytes: 5 x 16 for xmm6..xmm10 plus 8 so the save area is 16-byte aligned (assuming rsp = 8 mod 16 at entry)
+       movaps  %xmm6,-8-80(%rax)       # xmm6..xmm10 are callee-saved on Win64 and are all overwritten below
+       movaps  %xmm7,-8-64(%rax)
+       movaps  %xmm8,-8-48(%rax)
+       movaps  %xmm9,-8-32(%rax)
+       movaps  %xmm10,-8-16(%rax)
+.Lprologue_shaext:
+       leaq    K256+128(%rip),%rcx     # rcx = K256 biased by +128; table offsets below are written as N-128
+       movdqu  (%rdi),%xmm1            # xmm1 = state words 0..3
+       movdqu  16(%rdi),%xmm2          # xmm2 = state words 4..7
+       movdqa  512-128(%rcx),%xmm7     # xmm7 = byte-swap mask stored at K256+512
+
+       pshufd  $0x1b,%xmm1,%xmm0       # reverse dword order of words 0..3 into xmm0
+       pshufd  $0xb1,%xmm1,%xmm1       # swap adjacent dwords within each qword
+       pshufd  $0x1b,%xmm2,%xmm2       # reverse dword order of words 4..7
+       movdqa  %xmm7,%xmm8             # keep a copy of the mask; xmm7 is reused as scratch inside the loop
+.byte  102,15,58,15,202,8              # palignr $8,%xmm2,%xmm1
+       punpcklqdq      %xmm0,%xmm2     # xmm1/xmm2 now hold the state in the interleaved two-register layout that sha256rnds2 operates on
+       jmp     .Loop_shaext
+
+.p2align       4
+.Loop_shaext:                          # one iteration per 64-byte block; xmm3..xmm6 carry the message schedule
+       movdqu  (%rsi),%xmm3            # message words 0..3 (unaligned load)
+       movdqu  16(%rsi),%xmm4          # message words 4..7
+       movdqu  32(%rsi),%xmm5          # message words 8..11
+.byte  102,15,56,0,223                 # pshufb %xmm7,%xmm3 - byte-swap each 32-bit word
+       movdqu  48(%rsi),%xmm6          # message words 12..15
+
+       movdqa  0-128(%rcx),%xmm0       # rounds 0..3: xmm0 = K[0..3] + W[0..3]
+       paddd   %xmm3,%xmm0
+.byte  102,15,56,0,231                 # pshufb %xmm7,%xmm4
+       movdqa  %xmm2,%xmm10            # save incoming state half for the feed-forward add at the end of the block
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2 - two rounds using the low qword of xmm0
+       pshufd  $0x0e,%xmm0,%xmm0       # move the high qword of K+W down for the next two rounds
+       nop
+       movdqa  %xmm1,%xmm9             # save the other incoming state half
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1 - next two rounds
+
+       movdqa  32-128(%rcx),%xmm0      # rounds 4..7
+       paddd   %xmm4,%xmm0
+.byte  102,15,56,0,239                 # pshufb %xmm7,%xmm5
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       leaq    64(%rsi),%rsi           # advance the input pointer to the next block (this block is fully loaded)
+.byte  15,56,204,220                   # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  64-128(%rcx),%xmm0      # rounds 8..11
+       paddd   %xmm5,%xmm0
+.byte  102,15,56,0,247                 # pshufb %xmm7,%xmm6 - last use of xmm7 as the mask in this iteration
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7             # from here on xmm7 is scratch for the schedule update
+.byte  102,15,58,15,253,4              # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                   # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  96-128(%rcx),%xmm0      # rounds 12..15
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                   # sha256msg2 %xmm6,%xmm3
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4              # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                   # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  128-128(%rcx),%xmm0     # rounds 16..19; the same pattern repeats below with xmm3..xmm6 rotating roles
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                   # sha256msg2 %xmm3,%xmm4
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4              # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                   # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  160-128(%rcx),%xmm0     # rounds 20..23
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                   # sha256msg2 %xmm4,%xmm5
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4              # palignr $4,%xmm4,%xmm7
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220                   # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  192-128(%rcx),%xmm0     # rounds 24..27
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245                   # sha256msg2 %xmm5,%xmm6
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4              # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                   # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  224-128(%rcx),%xmm0     # rounds 28..31
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                   # sha256msg2 %xmm6,%xmm3
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4              # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                   # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  256-128(%rcx),%xmm0     # rounds 32..35
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                   # sha256msg2 %xmm3,%xmm4
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4              # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                   # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  288-128(%rcx),%xmm0     # rounds 36..39
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                   # sha256msg2 %xmm4,%xmm5
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4              # palignr $4,%xmm4,%xmm7
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220                   # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  320-128(%rcx),%xmm0     # rounds 40..43
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245                   # sha256msg2 %xmm5,%xmm6
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4              # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                   # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  352-128(%rcx),%xmm0     # rounds 44..47
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                   # sha256msg2 %xmm6,%xmm3
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4              # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                   # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  384-128(%rcx),%xmm0     # rounds 48..51
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                   # sha256msg2 %xmm3,%xmm4
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4              # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                   # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  416-128(%rcx),%xmm0     # rounds 52..55; no further sha256msg1 is issued from here on
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                   # sha256msg2 %xmm4,%xmm5
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4              # palignr $4,%xmm4,%xmm7
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       paddd   %xmm7,%xmm6
+
+       movdqa  448-128(%rcx),%xmm0     # rounds 56..59
+       paddd   %xmm5,%xmm0
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+.byte  15,56,205,245                   # sha256msg2 %xmm5,%xmm6 - completes the last four schedule words
+       movdqa  %xmm8,%xmm7             # restore the byte-swap mask into xmm7 for the next iteration
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  480-128(%rcx),%xmm0     # rounds 60..63
+       paddd   %xmm6,%xmm0
+       nop
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       decq    %rdx                    # one block done; ZF set here is consumed by the jnz below (the SSE ops in between do not write flags). NOTE(review): a count of 0 on entry would wrap - presumably callers guarantee at least one block
+       nop
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       paddd   %xmm10,%xmm2            # feed-forward: add the state saved at the top of the iteration
+       paddd   %xmm9,%xmm1
+       jnz     .Loop_shaext            # loop while blocks remain
+
+       pshufd  $0xb1,%xmm2,%xmm2       # undo the prologue's register layout so the state can be stored back in memory order
+       pshufd  $0x1b,%xmm1,%xmm7
+       pshufd  $0xb1,%xmm1,%xmm1
+       punpckhqdq      %xmm2,%xmm1
+.byte  102,15,58,15,215,8              # palignr $8,%xmm7,%xmm2
+
+       movdqu  %xmm1,(%rdi)            # write back state words 0..3
+       movdqu  %xmm2,16(%rdi)          # write back state words 4..7
+       movaps  -8-80(%rax),%xmm6       # restore the callee-saved xmm registers saved in the prologue
+       movaps  -8-64(%rax),%xmm7
+       movaps  -8-48(%rax),%xmm8
+       movaps  -8-32(%rax),%xmm9
+       movaps  -8-16(%rax),%xmm10
+       movq    %rax,%rsp               # release the 88-byte frame
+.Lepilogue_shaext:
+       movq    8(%rsp),%rdi            # restore rdi/rsi from the slots written at entry
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3               # rep ret (two-byte return encoding)
+.LSEH_end_sha256_block_data_order_shaext:
+.def   sha256_block_data_order_ssse3;  .scl 3; .type 32;       .endef
+.p2align       6
+sha256_block_data_order_ssse3:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha256_block_data_order_ssse3:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+
+.Lssse3_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx
+       subq    $160,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+
+       movaps  %xmm6,64+32(%rsp)
+       movaps  %xmm7,64+48(%rsp)
+       movaps  %xmm8,64+64(%rsp)
+       movaps  %xmm9,64+80(%rsp)
+.Lprologue_ssse3:
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+
+
+       jmp     .Lloop_ssse3
+.p2align       4
+.Lloop_ssse3:
+       movdqa  K256+512(%rip),%xmm7
+       movdqu  0(%rsi),%xmm0
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+.byte  102,15,56,0,199
+       movdqu  48(%rsi),%xmm3
+       leaq    K256(%rip),%rbp
+.byte  102,15,56,0,207
+       movdqa  0(%rbp),%xmm4
+       movdqa  32(%rbp),%xmm5
+.byte  102,15,56,0,215
+       paddd   %xmm0,%xmm4
+       movdqa  64(%rbp),%xmm6
+.byte  102,15,56,0,223
+       movdqa  96(%rbp),%xmm7
+       paddd   %xmm1,%xmm5
+       paddd   %xmm2,%xmm6
+       paddd   %xmm3,%xmm7
+       movdqa  %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       movdqa  %xmm5,16(%rsp)
+       movl    %ebx,%edi
+       movdqa  %xmm6,32(%rsp)
+       xorl    %ecx,%edi
+       movdqa  %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     .Lssse3_00_47
+
+.p2align       4
+.Lssse3_00_47:
+       subq    $-128,%rbp
+       rorl    $14,%r13d
+       movdqa  %xmm1,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm3,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,224,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,250,4
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm3,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm0
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm0
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm0,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  0(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm0,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,0(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm2,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm0,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,225,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,251,4
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm0,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm1
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm1
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm1,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  32(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm1,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,16(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm3,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm1,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,226,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,248,4
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm1,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm2
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm2
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm2,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  64(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm2,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,32(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm0,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm2,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,227,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,249,4
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm2,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm3
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm3
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm3,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  96(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm3,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     .Lssse3_00_47
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi        # rdi = pointer kept in stack slot 64+0(%rsp); eight 32-bit words are read/written through it below
+       movl    %r14d,%eax             # eax = r14d
+
+       addl    0(%rdi),%eax           # add the eight stored words at 0..28(%rdi) into eax, ebx, ecx, edx, r8d-r11d
+       leaq    64(%rsi),%rsi          # advance rsi by 64 bytes
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi       # compare rsi against the limit kept in slot 64+16(%rsp); flags consumed by jb below
+
+       movl    %eax,0(%rdi)           # store the eight sums back to 0..28(%rdi); movl leaves the cmpq flags intact
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop_ssse3           # unsigned compare: loop again while rsi is below the limit
+
+       movq    88(%rsp),%rsi          # rsi = value kept at 88(%rsp); presumably the entry-time rsp (the AVX prologue stores it there) - confirm against the SSSE3 prologue
+
+       movaps  64+32(%rsp),%xmm6      # reload xmm6-xmm9 from stack slots 64+32 .. 64+80 (callee-saved under the Microsoft x64 convention)
+       movaps  64+48(%rsp),%xmm7
+       movaps  64+64(%rsp),%xmm8
+       movaps  64+80(%rsp),%xmm9
+       movq    -48(%rsi),%r15         # reload r15, r14, r13, r12, rbp, rbx from the six quadwords just below rsi
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp            # rsp = rsi, discarding the local frame
+
+.Lepilogue_ssse3:
+       movq    8(%rsp),%rdi           # reload rdi and rsi from 8(%rsp) and 16(%rsp)
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3              # bytes F3 C3 = "rep ret"
+
+.LSEH_end_sha256_block_data_order_ssse3:
+.def   sha256_block_data_order_avx;    .scl 3; .type 32;       .endef
+.p2align       6
+sha256_block_data_order_avx:           # entry: arguments arrive in rcx, rdx, r8 and are moved to rdi, rsi, rdx below
+       movq    %rdi,8(%rsp)           # park rdi/rsi (callee-saved under the Microsoft x64 convention) at 8(%rsp)/16(%rsp), just above the return address
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha256_block_data_order_avx:
+       movq    %rcx,%rdi              # rdi = 1st argument
+       movq    %rdx,%rsi              # rsi = 2nd argument
+       movq    %r8,%rdx               # rdx = 3rd argument
+
+
+.Lavx_shortcut:
+       movq    %rsp,%rax              # rax = rsp before the pushes; stored at 88(%rsp) further down
+
+       pushq   %rbx                   # save callee-saved rbx, rbp, r12-r15
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx                # rdx *= 16
+       subq    $160,%rsp              # reserve 160 bytes of stack
+       leaq    (%rsi,%rdx,4),%rdx     # rdx = rsi + 4*rdx = rsi + 64*(3rd argument); presumably the end-of-input pointer if that argument counts 64-byte blocks
+       andq    $-64,%rsp              # round rsp down to a 64-byte boundary
+       movq    %rdi,64+0(%rsp)        # slot 64+0:  1st argument (rdi)
+       movq    %rsi,64+8(%rsp)        # slot 64+8:  2nd argument (rsi)
+       movq    %rdx,64+16(%rsp)       # slot 64+16: rdx computed by the leaq above
+       movq    %rax,88(%rsp)          # slot 88 (= 64+24): rsp value from before the pushes
+
+       movaps  %xmm6,64+32(%rsp)      # save xmm6-xmm9 (callee-saved under the Microsoft x64 convention); xmm7-xmm9 are overwritten below in this function
+       movaps  %xmm7,64+48(%rsp)
+       movaps  %xmm8,64+64(%rsp)
+       movaps  %xmm9,64+80(%rsp)
+.Lprologue_avx:
+
+       vzeroupper                     # zero the upper bits of the ymm registers
+       movl    0(%rdi),%eax           # load eight 32-bit words from 0..28(%rdi) into eax, ebx, ecx, edx, r8d-r11d
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%xmm8        # xmm8 = 16 bytes at K256+544; used as a vpshufb mask in .Lavx_00_47
+       vmovdqa K256+512+64(%rip),%xmm9        # xmm9 = 16 bytes at K256+576; used as a vpshufb mask in .Lavx_00_47
+       jmp     .Lloop_avx
+.p2align       4
+.Lloop_avx:
+       vmovdqa K256+512(%rip),%xmm7   # xmm7 = 16-byte vpshufb mask at K256+512
+       vmovdqu 0(%rsi),%xmm0          # load 64 bytes at 0..63(%rsi) into xmm0-xmm3 (unaligned loads)
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm7,%xmm0,%xmm0      # permute the bytes of xmm0-xmm3 with the xmm7 mask; presumably a per-word byte swap (mask contents not visible here)
+       leaq    K256(%rip),%rbp        # rbp = address of K256
+       vpshufb %xmm7,%xmm1,%xmm1
+       vpshufb %xmm7,%xmm2,%xmm2
+       vpaddd  0(%rbp),%xmm0,%xmm4    # xmm4 = xmm0 + 16 bytes at K256+0, as four 32-bit lanes
+       vpshufb %xmm7,%xmm3,%xmm3
+       vpaddd  32(%rbp),%xmm1,%xmm5   # successive table operands are 32 bytes apart (0, 32, 64, 96)
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       vpaddd  96(%rbp),%xmm3,%xmm7   # overwrites the mask in xmm7; it is reloaded at the top of .Lloop_avx
+       vmovdqa %xmm4,0(%rsp)          # park the four sums at 0..63(%rsp); .Lavx_00_47 consumes them with addl N(%rsp)
+       movl    %eax,%r14d             # r14d = eax
+       vmovdqa %xmm5,16(%rsp)
+       movl    %ebx,%edi              # rdi was saved to 64+0(%rsp) in the prologue, so edi is reused as scratch
+       vmovdqa %xmm6,32(%rsp)
+       xorl    %ecx,%edi              # edi = ebx ^ ecx
+       vmovdqa %xmm7,48(%rsp)
+       movl    %r8d,%r13d             # r13d = r8d
+       jmp     .Lavx_00_47
+
+.p2align       4
+.Lavx_00_47:
+       subq    $-128,%rbp
+       vpalignr        $4,%xmm0,%xmm1,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm2,%xmm3,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm3,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm0,%xmm0
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       vpshufd $80,%xmm0,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  0(%rbp),%xmm0,%xmm6
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,0(%rsp)
+       vpalignr        $4,%xmm1,%xmm2,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm3,%xmm0,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm0,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm1,%xmm1
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       vpshufd $80,%xmm1,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  32(%rbp),%xmm1,%xmm6
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,16(%rsp)
+       vpalignr        $4,%xmm2,%xmm3,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm0,%xmm1,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm1,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm2,%xmm2
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       vpshufd $80,%xmm2,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,32(%rsp)
+       vpalignr        $4,%xmm3,%xmm0,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm1,%xmm2,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm2,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm3,%xmm3
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       vpshufd $80,%xmm3,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  96(%rbp),%xmm3,%xmm6
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     .Lavx_00_47
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi
+       movl    %r14d,%eax
+
+       addl    0(%rdi),%eax
+       leaq    64(%rsi),%rsi
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop_avx
+
+       movq    88(%rsp),%rsi
+
+       vzeroupper
+       movaps  64+32(%rsp),%xmm6
+       movaps  64+48(%rsp),%xmm7
+       movaps  64+64(%rsp),%xmm8
+       movaps  64+80(%rsp),%xmm9
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+.Lepilogue_avx:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_sha256_block_data_order_avx:
+.def   sha256_block_data_order_avx2;   .scl 3; .type 32;       .endef
+.p2align       6
+sha256_block_data_order_avx2:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha256_block_data_order_avx2:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+
+.Lavx2_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       subq    $608,%rsp
+       shlq    $4,%rdx
+       andq    $-1024,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       addq    $448,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+
+       movaps  %xmm6,64+32(%rsp)
+       movaps  %xmm7,64+48(%rsp)
+       movaps  %xmm8,64+64(%rsp)
+       movaps  %xmm9,64+80(%rsp)
+.Lprologue_avx2:
+
+       vzeroupper
+       subq    $-64,%rsi
+       movl    0(%rdi),%eax
+       movq    %rsi,%r12
+       movl    4(%rdi),%ebx
+       cmpq    %rdx,%rsi
+       movl    8(%rdi),%ecx
+       cmoveq  %rsp,%r12
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%ymm8
+       vmovdqa K256+512+64(%rip),%ymm9
+       jmp     .Loop_avx2
+.p2align       4
+.Loop_avx2:
+       vmovdqa K256+512(%rip),%ymm7
+       vmovdqu -64+0(%rsi),%xmm0
+       vmovdqu -64+16(%rsi),%xmm1
+       vmovdqu -64+32(%rsi),%xmm2
+       vmovdqu -64+48(%rsi),%xmm3
+
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm7,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm7,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+
+       leaq    K256(%rip),%rbp
+       vpshufb %ymm7,%ymm2,%ymm2
+       vpaddd  0(%rbp),%ymm0,%ymm4
+       vpshufb %ymm7,%ymm3,%ymm3
+       vpaddd  32(%rbp),%ymm1,%ymm5
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       vpaddd  96(%rbp),%ymm3,%ymm7
+       vmovdqa %ymm4,0(%rsp)
+       xorl    %r14d,%r14d
+       vmovdqa %ymm5,32(%rsp)
+       leaq    -64(%rsp),%rsp
+       movl    %ebx,%edi
+       vmovdqa %ymm6,0(%rsp)
+       xorl    %ecx,%edi
+       vmovdqa %ymm7,32(%rsp)
+       movl    %r9d,%r12d
+       subq    $-32*4,%rbp
+       jmp     .Lavx2_00_47
+
+.p2align       4
+.Lavx2_00_47:
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm0,%ymm1,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm2,%ymm3,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm0,%ymm0
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm3,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm0,%ymm0
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm0,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  0(%rbp),%ymm0,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm1,%ymm2,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm3,%ymm0,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm1,%ymm1
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm0,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm1,%ymm1
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm1,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  32(%rbp),%ymm1,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm2,%ymm3,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm0,%ymm1,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm2,%ymm2
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm1,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm2,%ymm2
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm2,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm3,%ymm0,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm1,%ymm2,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm3,%ymm3
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm2,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm3,%ymm3
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm3,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  96(%rbp),%ymm3,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    128(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jne     .Lavx2_00_47
+       addl    0+64(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+64(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+64(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+64(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+64(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+64(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+64(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+64(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       addl    0(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rbp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       cmpq    80(%rbp),%rsi
+       je      .Ldone_avx2
+
+       xorl    %r14d,%r14d
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       movl    %r9d,%r12d
+       jmp     .Lower_avx2
+.p2align       4
+.Lower_avx2:
+       addl    0+16(%rbp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+16(%rbp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+16(%rbp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+16(%rbp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+16(%rbp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+16(%rbp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+16(%rbp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+16(%rbp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       leaq    -64(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rsp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       leaq    128(%rsi),%rsi
+       addl    24(%rdi),%r10d
+       movq    %rsi,%r12
+       addl    28(%rdi),%r11d
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+.Ldone_avx2:
+       leaq    (%rbp),%rsp
+       movq    88(%rsp),%rsi
+
+       vzeroupper
+       movaps  64+32(%rsp),%xmm6
+       movaps  64+48(%rsp),%xmm7
+       movaps  64+64(%rsp),%xmm8
+       movaps  64+80(%rsp),%xmm9
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+.Lepilogue_avx2:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_sha256_block_data_order_avx2:
+
+.def   se_handler;     .scl 3; .type 32;       .endef  # file-local (.scl 3) function symbol
+.p2align       4
+se_handler:    # Win64 language-specific exception handler for the entries that list it in .xdata; reads %r8 = CONTEXT*, %r9 = DISPATCHER_CONTEXT*; rebuilds the caller's non-volatile registers in the context, then continues the unwind
+       pushq   %rsi            # preserve every non-volatile register used below
+       pushq   %rdi
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushfq                  # the encoded "cld" below modifies DF; popfq restores it
+       subq    $64,%rsp        # 32-byte shadow space + four stack arguments for RtlVirtualUnwind
+
+       movq    120(%r8),%rax   # context->Rax
+       movq    248(%r8),%rbx   # context->Rip
+
+       movq    8(%r9),%rsi     # disp->ImageBase
+       movq    56(%r9),%r11    # disp->HandlerData: the two .rva words following the handler in .xdata
+
+       movl    0(%r11),%r10d   # HandlerData[0]: RVA of the prologue label
+       leaq    (%rsi,%r10,1),%r10
+       cmpq    %r10,%rbx
+       jb      .Lin_prologue   # Rip precedes the prologue label: no frame yet, %rax (context->Rax) is used as the entry %rsp
+
+       movq    152(%r8),%rax   # context->Rsp
+
+       movl    4(%r11),%r10d   # HandlerData[1]: RVA of the epilogue label
+       leaq    (%rsi,%r10,1),%r10
+       cmpq    %r10,%rbx
+       jae     .Lin_prologue   # Rip at/after the epilogue label: %rsp and saved registers are already restored
+       leaq    .Lavx2_shortcut(%rip),%r10
+       cmpq    %r10,%rbx
+       jb      .Lnot_in_avx2
+
+       andq    $-1024,%rax     # AVX2 code moves %rsp while hashing: round Rsp down to a 1 KiB boundary...
+       addq    $448,%rax       # ...and add 448 (the offset the AVX2 loop itself applies with "leaq 448(%rsp),%rsp") to recover the frame base
+.Lnot_in_avx2:
+       movq    %rax,%rsi       # keep the frame base for the XMM copy below
+       movq    64+24(%rax),%rax        # saved original %rsp (the slot the epilogue reloads from 88(%rsp))
+
+       movq    -8(%rax),%rbx   # callee-saved GPRs stored just below the original %rsp
+       movq    -16(%rax),%rbp
+       movq    -24(%rax),%r12
+       movq    -32(%rax),%r13
+       movq    -40(%rax),%r14
+       movq    -48(%rax),%r15
+       movq    %rbx,144(%r8)   # context->Rbx
+       movq    %rbp,160(%r8)   # context->Rbp
+       movq    %r12,216(%r8)   # context->R12
+       movq    %r13,224(%r8)   # context->R13
+       movq    %r14,232(%r8)   # context->R14
+       movq    %r15,240(%r8)   # context->R15
+
+       leaq    .Lepilogue(%rip),%r10
+       cmpq    %r10,248(%r8)   # compare context->Rip itself: %rbx was reloaded with the saved Rbx above and no longer holds the faulting address
+       jb      .Lin_prologue   # Rip below .Lepilogue: skip the XMM copy for that code path
+
+       leaq    64+32(%rsi),%rsi        # XMM save area in the frame (the epilogue reloads xmm6..xmm9 from 64+32(%rsp) onward)
+       leaq    512(%r8),%rdi   # &context->Xmm6
+       movl    $8,%ecx         # 8 qwords = 64 bytes = xmm6..xmm9
+.long  0xa548f3fc              # cld; rep movsq
+
+.Lin_prologue:                 # on every path into here %rax is the function's entry %rsp
+       movq    8(%rax),%rdi    # the entry code spilled the caller's %rdi/%rsi at 8/16 above the entry %rsp
+       movq    16(%rax),%rsi
+       movq    %rax,152(%r8)   # context->Rsp
+       movq    %rsi,168(%r8)   # context->Rsi
+       movq    %rdi,176(%r8)   # context->Rdi
+
+       movq    40(%r9),%rdi    # disp->ContextRecord
+       movq    %r8,%rsi
+       movl    $154,%ecx       # 154 qwords = 1232 bytes = sizeof(CONTEXT)
+.long  0xa548f3fc              # cld; rep movsq: copy the patched context into *disp->ContextRecord
+
+       movq    %r9,%rsi
+       xorq    %rcx,%rcx       # arg1: UNW_FLAG_NHANDLER (0)
+       movq    8(%rsi),%rdx    # arg2: disp->ImageBase
+       movq    0(%rsi),%r8     # arg3: disp->ControlPc
+       movq    16(%rsi),%r9    # arg4: disp->FunctionEntry
+       movq    40(%rsi),%r10   # disp->ContextRecord
+       leaq    56(%rsi),%r11   # &disp->HandlerData
+       leaq    24(%rsi),%r12   # &disp->EstablisherFrame
+       movq    %r10,32(%rsp)   # arg5
+       movq    %r11,40(%rsp)   # arg6
+       movq    %r12,48(%rsp)   # arg7
+       movq    %rcx,56(%rsp)   # arg8: NULL
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       movl    $1,%eax         # return ExceptionContinueSearch
+       addq    $64,%rsp
+       popfq
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbp
+       popq    %rbx
+       popq    %rdi
+       popq    %rsi
+       .byte   0xf3,0xc3       # rep ret
+
+.def   shaext_handler; .scl 3; .type 32;       .endef  # file-local (.scl 3) function symbol
+.p2align       4
+shaext_handler:        # Win64 exception handler registered in .xdata for the _shaext entry; reads %r8 = CONTEXT*, %r9 = DISPATCHER_CONTEXT*; finishes in se_handler's .Lin_prologue tail, which also pops this frame
+       pushq   %rsi            # same save sequence as se_handler, because .Lin_prologue unwinds it
+       pushq   %rdi
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushfq
+       subq    $64,%rsp        # scratch/argument area consumed by the RtlVirtualUnwind call in .Lin_prologue
+
+       movq    120(%r8),%rax   # context->Rax; presumably the entry %rsp (the shaext body is outside this view - confirm)
+       movq    248(%r8),%rbx   # context->Rip
+
+       leaq    .Lprologue_shaext(%rip),%r10
+       cmpq    %r10,%rbx
+       jb      .Lin_prologue   # Rip before the prologue label: nothing to restore here
+
+       leaq    .Lepilogue_shaext(%rip),%r10
+       cmpq    %r10,%rbx
+       jae     .Lin_prologue   # Rip at/after the epilogue label: nothing to restore here
+
+       leaq    -8-80(%rax),%rsi        # 80-byte save area that ends 8 bytes below the address in %rax
+       leaq    512(%r8),%rdi   # &context->Xmm6
+       movl    $10,%ecx        # 10 qwords = 80 bytes = five XMM registers starting at Xmm6
+.long  0xa548f3fc              # cld; rep movsq
+
+       jmp     .Lin_prologue   # shared tail: fix Rsp/Rsi/Rdi, call RtlVirtualUnwind, return
+
+.section       .pdata          # Win64 function table: one (begin RVA, end RVA, unwind-info RVA) triple per entry point
+.p2align       2
+.rva   .LSEH_begin_sha256_block_data_order
+.rva   .LSEH_end_sha256_block_data_order
+.rva   .LSEH_info_sha256_block_data_order
+.rva   .LSEH_begin_sha256_block_data_order_shaext
+.rva   .LSEH_end_sha256_block_data_order_shaext
+.rva   .LSEH_info_sha256_block_data_order_shaext
+.rva   .LSEH_begin_sha256_block_data_order_ssse3
+.rva   .LSEH_end_sha256_block_data_order_ssse3
+.rva   .LSEH_info_sha256_block_data_order_ssse3
+.rva   .LSEH_begin_sha256_block_data_order_avx
+.rva   .LSEH_end_sha256_block_data_order_avx
+.rva   .LSEH_info_sha256_block_data_order_avx
+.rva   .LSEH_begin_sha256_block_data_order_avx2
+.rva   .LSEH_end_sha256_block_data_order_avx2
+.rva   .LSEH_info_sha256_block_data_order_avx2
+.section       .xdata          # unwind-info records referenced by the third word of each .pdata triple
+.p2align       3
+.LSEH_info_sha256_block_data_order:
+.byte  9,0,0,0                 # UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER; prologue size, code count and frame register all zero
+.rva   se_handler              # language-specific handler
+.rva   .Lprologue,.Lepilogue   # HandlerData[0..1]: the label pair se_handler reads at 0(%r11) and 4(%r11)
+.LSEH_info_sha256_block_data_order_shaext:
+.byte  9,0,0,0
+.rva   shaext_handler          # no HandlerData: shaext_handler takes its label bounds RIP-relative
+.LSEH_info_sha256_block_data_order_ssse3:
+.byte  9,0,0,0
+.rva   se_handler
+.rva   .Lprologue_ssse3,.Lepilogue_ssse3
+.LSEH_info_sha256_block_data_order_avx:
+.byte  9,0,0,0
+.rva   se_handler
+.rva   .Lprologue_avx,.Lepilogue_avx
+.LSEH_info_sha256_block_data_order_avx2:
+.byte  9,0,0,0
+.rva   se_handler
+.rva   .Lprologue_avx2,.Lepilogue_avx2
+
index acad0ec1e7b0f70c2260c8a7702969e197926cf1..72a7f73d77ac2ea5a5129dfb182da8819346016f 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha512-586.s"
 .text
 .globl _sha512_block_data_order
 .def   _sha512_block_data_order;       .scl    2;      .type   32;     .endef
@@ -594,6 +593,8 @@ _sha512_block_data_order:
 .long  4234509866,1501505948
 .long  987167468,1607167915
 .long  1246189591,1816402316
+.long  67438087,66051
+.long  202182159,134810123
 .byte  83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
 .byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
 .byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
index 034dab2388d3feff0a2e41f8e4f71b50715a43c8..419fa2a9803366509b8b36e1a85f6810d3565a0c 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 .text  
 
 
-.globl sha256_block_data_order
-.def   sha256_block_data_order;        .scl 2; .type 32;       .endef
+.globl sha512_block_data_order
+.def   sha512_block_data_order;        .scl 2; .type 32;       .endef
 .p2align       4
-sha256_block_data_order:
+sha512_block_data_order:
        movq    %rdi,8(%rsp)
        movq    %rsi,16(%rsp)
        movq    %rsp,%rax
-.LSEH_begin_sha256_block_data_order:
+.LSEH_begin_sha512_block_data_order:
        movq    %rcx,%rdi
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
+
        leaq    _gnutls_x86_cpuid_s(%rip),%r11
        movl    0(%r11),%r9d
        movl    4(%r11),%r10d
        movl    8(%r11),%r11d
-       testl   $512,%r10d
-       jnz     .Lssse3_shortcut
+       testl   $2048,%r10d
+       jnz     .Lxop_shortcut
+       andl    $296,%r11d
+       cmpl    $296,%r11d
+       je      .Lavx2_shortcut
+       andl    $1073741824,%r9d
+       andl    $268435968,%r10d
+       orl     %r9d,%r10d
+       cmpl    $1342177792,%r10d
+       je      .Lavx_shortcut
+       movq    %rsp,%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
-       movq    %rsp,%r11
+
        shlq    $4,%rdx
-       subq    $64+32,%rsp
-       leaq    (%rsi,%rdx,4),%rdx
+       subq    $128+32,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
        andq    $-64,%rsp
-       movq    %rdi,64+0(%rsp)
-       movq    %rsi,64+8(%rsp)
-       movq    %rdx,64+16(%rsp)
-       movq    %r11,64+24(%rsp)
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
 .Lprologue:
 
-       movl    0(%rdi),%eax
-       movl    4(%rdi),%ebx
-       movl    8(%rdi),%ecx
-       movl    12(%rdi),%edx
-       movl    16(%rdi),%r8d
-       movl    20(%rdi),%r9d
-       movl    24(%rdi),%r10d
-       movl    28(%rdi),%r11d
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
        jmp     .Lloop
 
 .p2align       4
 .Lloop:
-       movl    %ebx,%edi
-       leaq    K256(%rip),%rbp
-       xorl    %ecx,%edi
-       movl    0(%rsi),%r12d
-       movl    %r8d,%r13d
-       movl    %eax,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,0(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-       movl    4(%rsi),%r12d
-       movl    %edx,%r13d
-       movl    %r11d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,4(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-       movl    8(%rsi),%r12d
-       movl    %ecx,%r13d
-       movl    %r10d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,8(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-       movl    12(%rsi),%r12d
-       movl    %ebx,%r13d
-       movl    %r9d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,12(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-       movl    16(%rsi),%r12d
-       movl    %eax,%r13d
-       movl    %r8d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,16(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-       movl    20(%rsi),%r12d
-       movl    %r11d,%r13d
-       movl    %edx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,20(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-       movl    24(%rsi),%r12d
-       movl    %r10d,%r13d
-       movl    %ecx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,24(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-       movl    28(%rsi),%r12d
-       movl    %r9d,%r13d
-       movl    %ebx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,28(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-       movl    32(%rsi),%r12d
-       movl    %r8d,%r13d
-       movl    %eax,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,32(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-       movl    36(%rsi),%r12d
-       movl    %edx,%r13d
-       movl    %r11d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,36(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-       movl    40(%rsi),%r12d
-       movl    %ecx,%r13d
-       movl    %r10d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,40(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-       movl    44(%rsi),%r12d
-       movl    %ebx,%r13d
-       movl    %r9d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,44(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-       movl    48(%rsi),%r12d
-       movl    %eax,%r13d
-       movl    %r8d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,48(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-       movl    52(%rsi),%r12d
-       movl    %r11d,%r13d
-       movl    %edx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,52(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-       movl    56(%rsi),%r12d
-       movl    %r10d,%r13d
-       movl    %ecx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,56(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-       movl    60(%rsi),%r12d
-       movl    %r9d,%r13d
-       movl    %ebx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,60(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    4(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
+       movq    %rbx,%rdi
+       leaq    K512(%rip),%rbp
+       xorq    %rcx,%rdi
+       movq    0(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    8(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    16(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    24(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
 
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    32(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    40(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    48(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    56(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rax
+       movq    64(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    72(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    80(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    88(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    96(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    104(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    112(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    120(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
        jmp     .Lrounds_16_xx
 .p2align       4
 .Lrounds_16_xx:
+       movq    8(%rsp),%r13
+       movq    112(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    72(%rsp),%r12
+
+       addq    0(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       movq    16(%rsp),%r13
+       movq    120(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    80(%rsp),%r12
+
+       addq    8(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       movq    24(%rsp),%r13
+       movq    0(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    88(%rsp),%r12
+
+       addq    16(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       movq    32(%rsp),%r13
+       movq    8(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    96(%rsp),%r12
+
+       addq    24(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       movq    40(%rsp),%r13
+       movq    16(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    104(%rsp),%r12
+
+       addq    32(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       movq    48(%rsp),%r13
+       movq    24(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    112(%rsp),%r12
+
+       addq    40(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       movq    56(%rsp),%r13
+       movq    32(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    120(%rsp),%r12
+
+       addq    48(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       movq    64(%rsp),%r13
+       movq    40(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    0(%rsp),%r12
+
+       addq    56(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       movq    72(%rsp),%r13
+       movq    48(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    8(%rsp),%r12
+
+       addq    64(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       movq    80(%rsp),%r13
+       movq    56(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    16(%rsp),%r12
+
+       addq    72(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       movq    88(%rsp),%r13
+       movq    64(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    24(%rsp),%r12
+
+       addq    80(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       movq    96(%rsp),%r13
+       movq    72(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    32(%rsp),%r12
+
+       addq    88(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
 
-       movl    56(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    36(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    0(%rsp),%r12d
-       movl    %r8d,%r13d
-       addl    %r14d,%r12d
-       movl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,0(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       movl    8(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-
-       movl    60(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    40(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    4(%rsp),%r12d
-       movl    %edx,%r13d
-       addl    %r14d,%r12d
-       movl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,4(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       movl    12(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-
-       movl    0(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    44(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    8(%rsp),%r12d
-       movl    %ecx,%r13d
-       addl    %r14d,%r12d
-       movl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,8(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       movl    16(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-
-       movl    4(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    48(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    12(%rsp),%r12d
-       movl    %ebx,%r13d
-       addl    %r14d,%r12d
-       movl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,12(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       movl    20(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-
-       movl    8(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    52(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    16(%rsp),%r12d
-       movl    %eax,%r13d
-       addl    %r14d,%r12d
-       movl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,16(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       movl    24(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-
-       movl    12(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    56(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    20(%rsp),%r12d
-       movl    %r11d,%r13d
-       addl    %r14d,%r12d
-       movl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,20(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       movl    28(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-
-       movl    16(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    60(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    24(%rsp),%r12d
-       movl    %r10d,%r13d
-       addl    %r14d,%r12d
-       movl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,24(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       movl    32(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-
-       movl    20(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    0(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    28(%rsp),%r12d
-       movl    %r9d,%r13d
-       addl    %r14d,%r12d
-       movl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,28(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    36(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-
-       movl    24(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    4(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    32(%rsp),%r12d
-       movl    %r8d,%r13d
-       addl    %r14d,%r12d
-       movl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,32(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       movl    40(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-
-       movl    28(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    8(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    36(%rsp),%r12d
-       movl    %edx,%r13d
-       addl    %r14d,%r12d
-       movl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,36(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       movl    44(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-
-       movl    32(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    12(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    40(%rsp),%r12d
-       movl    %ecx,%r13d
-       addl    %r14d,%r12d
-       movl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,40(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       movl    48(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-
-       movl    36(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    16(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    44(%rsp),%r12d
-       movl    %ebx,%r13d
-       addl    %r14d,%r12d
-       movl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,44(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       movl    52(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-
-       movl    40(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    20(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    48(%rsp),%r12d
-       movl    %eax,%r13d
-       addl    %r14d,%r12d
-       movl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,48(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       movl    56(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-
-       movl    44(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    24(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    52(%rsp),%r12d
-       movl    %r11d,%r13d
-       addl    %r14d,%r12d
-       movl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,52(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       movl    60(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-
-       movl    48(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    28(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    56(%rsp),%r12d
-       movl    %r10d,%r13d
-       addl    %r14d,%r12d
-       movl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,56(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       movl    0(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-
-       movl    52(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    32(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    60(%rsp),%r12d
-       movl    %r9d,%r13d
-       addl    %r14d,%r12d
-       movl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,60(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    4(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-       cmpb    $0,3(%rbp)
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       movq    104(%rsp),%r13
+       movq    80(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    40(%rsp),%r12
+
+       addq    96(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       movq    112(%rsp),%r13
+       movq    88(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    48(%rsp),%r12
+
+       addq    104(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       movq    120(%rsp),%r13
+       movq    96(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    56(%rsp),%r12
+
+       addq    112(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       movq    0(%rsp),%r13
+       movq    104(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    64(%rsp),%r12
+
+       addq    120(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       cmpb    $0,7(%rbp)
        jnz     .Lrounds_16_xx
 
-       movq    64+0(%rsp),%rdi
-       leaq    64(%rsi),%rsi
-
-       addl    0(%rdi),%eax
-       addl    4(%rdi),%ebx
-       addl    8(%rdi),%ecx
-       addl    12(%rdi),%edx
-       addl    16(%rdi),%r8d
-       addl    20(%rdi),%r9d
-       addl    24(%rdi),%r10d
-       addl    28(%rdi),%r11d
-
-       cmpq    64+16(%rsp),%rsi
-
-       movl    %eax,0(%rdi)
-       movl    %ebx,4(%rdi)
-       movl    %ecx,8(%rdi)
-       movl    %edx,12(%rdi)
-       movl    %r8d,16(%rdi)
-       movl    %r9d,20(%rdi)
-       movl    %r10d,24(%rdi)
-       movl    %r11d,28(%rdi)
+       movq    128+0(%rsp),%rdi
+       addq    %r14,%rax
+       leaq    128(%rsi),%rsi
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
        jb      .Lloop
 
-       movq    64+24(%rsp),%rsi
-       movq    (%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
+       movq    152(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
 .Lepilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
-.LSEH_end_sha256_block_data_order:
+
+.LSEH_end_sha512_block_data_order:
 .p2align       6
 
-K256:
-.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.def   sha256_block_data_order_ssse3;  .scl 3; .type 32;       .endef
+K512:
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.byte  83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.def   sha512_block_data_order_xop;    .scl 3; .type 32;       .endef
 .p2align       6
-sha256_block_data_order_ssse3:
+sha512_block_data_order_xop:
        movq    %rdi,8(%rsp)
        movq    %rsi,16(%rsp)
        movq    %rsp,%rax
-.LSEH_begin_sha256_block_data_order_ssse3:
+.LSEH_begin_sha512_block_data_order_xop:
        movq    %rcx,%rdi
        movq    %rdx,%rsi
        movq    %r8,%rdx
 
-.Lssse3_shortcut:
+
+.Lxop_shortcut:
+       movq    %rsp,%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
-       movq    %rsp,%r11
+
        shlq    $4,%rdx
-       subq    $160,%rsp
-       leaq    (%rsi,%rdx,4),%rdx
+       subq    $256,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
        andq    $-64,%rsp
-       movq    %rdi,64+0(%rsp)
-       movq    %rsi,64+8(%rsp)
-       movq    %rdx,64+16(%rsp)
-       movq    %r11,64+24(%rsp)
-       movaps  %xmm6,64+32(%rsp)
-       movaps  %xmm7,64+48(%rsp)
-       movaps  %xmm8,64+64(%rsp)
-       movaps  %xmm9,64+80(%rsp)
-.Lprologue_ssse3:
-
-       movl    0(%rdi),%eax
-       movl    4(%rdi),%ebx
-       movl    8(%rdi),%ecx
-       movl    12(%rdi),%edx
-       movl    16(%rdi),%r8d
-       movl    20(%rdi),%r9d
-       movl    24(%rdi),%r10d
-       movl    28(%rdi),%r11d
-       movdqa  K256+512+32(%rip),%xmm8
-       movdqa  K256+512+64(%rip),%xmm9
-       jmp     .Lloop_ssse3
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
+       movaps  %xmm6,128+32(%rsp)
+       movaps  %xmm7,128+48(%rsp)
+       movaps  %xmm8,128+64(%rsp)
+       movaps  %xmm9,128+80(%rsp)
+       movaps  %xmm10,128+96(%rsp)
+       movaps  %xmm11,128+112(%rsp)
+.Lprologue_xop:
+
+       vzeroupper
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Lloop_xop
 .p2align       4
-.Lloop_ssse3:
-       movdqa  K256+512(%rip),%xmm7
-       movdqu  0(%rsi),%xmm0
-       movdqu  16(%rsi),%xmm1
-       movdqu  32(%rsi),%xmm2
-       movdqu  48(%rsi),%xmm3
-.byte  102,15,56,0,199
-       leaq    K256(%rip),%rbp
-.byte  102,15,56,0,207
-       movdqa  0(%rbp),%xmm4
-.byte  102,15,56,0,215
-       movdqa  32(%rbp),%xmm5
-       paddd   %xmm0,%xmm4
-       movdqa  64(%rbp),%xmm6
-.byte  102,15,56,0,223
-       movdqa  96(%rbp),%xmm7
-       paddd   %xmm1,%xmm5
-       paddd   %xmm2,%xmm6
-       paddd   %xmm3,%xmm7
-       movdqa  %xmm4,0(%rsp)
-       movl    %eax,%r14d
-       movdqa  %xmm5,16(%rsp)
-       movl    %ebx,%edi
-       movdqa  %xmm6,32(%rsp)
-       xorl    %ecx,%edi
-       movdqa  %xmm7,48(%rsp)
-       movl    %r8d,%r13d
-       jmp     .Lssse3_00_47
+.Lloop_xop:
+       vmovdqa K512+1280(%rip),%xmm11
+       vmovdqu 0(%rsi),%xmm0
+       leaq    K512+128(%rip),%rbp
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9
+       vpshufb %xmm11,%xmm7,%xmm7
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11
+       vmovdqa %xmm8,0(%rsp)
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %xmm11,112(%rsp)
+       movq    %r8,%r13
+       jmp     .Lxop_00_47
 
 .p2align       4
-.Lssse3_00_47:
-       subq    $-32*4,%rbp
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movdqa  %xmm1,%xmm4
-       movl    %r9d,%r12d
-       movdqa  %xmm3,%xmm7
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-.byte  102,15,58,15,224,4
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-.byte  102,15,58,15,250,4
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    0(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %ebx,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       paddd   %xmm7,%xmm0
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       psrld   $7,%xmm6
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       pshufd  $250,%xmm3,%xmm7
-       movl    %r8d,%r12d
-       pslld   $14,%xmm5
-       xorl    %edx,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       pslld   $11,%xmm5
-       addl    4(%rsp),%r10d
-       pxor    %xmm6,%xmm4
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       paddd   %xmm4,%xmm0
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    8(%rsp),%r9d
-.byte  102,65,15,56,0,248
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm0
-       addl    %r12d,%r9d
-       pshufd  $80,%xmm0,%xmm7
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       psrld   $10,%xmm7
-       movl    %ebx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       pxor    %xmm6,%xmm7
-       addl    12(%rsp),%r8d
-       movl    %r9d,%edi
-       movdqa  0(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       paddd   %xmm7,%xmm0
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       paddd   %xmm0,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       movdqa  %xmm6,0(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movdqa  %xmm2,%xmm4
-       movl    %ebx,%r12d
-       movdqa  %xmm0,%xmm7
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-.byte  102,15,58,15,225,4
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-.byte  102,15,58,15,251,4
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    16(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %r9d,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       paddd   %xmm7,%xmm1
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       psrld   $7,%xmm6
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       pshufd  $250,%xmm0,%xmm7
-       movl    %eax,%r12d
-       pslld   $14,%xmm5
-       xorl    %r11d,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       pslld   $11,%xmm5
-       addl    20(%rsp),%ecx
-       pxor    %xmm6,%xmm4
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       paddd   %xmm4,%xmm1
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    24(%rsp),%ebx
-.byte  102,65,15,56,0,248
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm1
-       addl    %r12d,%ebx
-       pshufd  $80,%xmm1,%xmm7
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       psrld   $10,%xmm7
-       movl    %r9d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       pxor    %xmm6,%xmm7
-       addl    28(%rsp),%eax
-       movl    %ebx,%edi
-       movdqa  32(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       paddd   %xmm7,%xmm1
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       paddd   %xmm1,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movdqa  %xmm6,16(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movdqa  %xmm3,%xmm4
-       movl    %r9d,%r12d
-       movdqa  %xmm1,%xmm7
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-.byte  102,15,58,15,226,4
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-.byte  102,15,58,15,248,4
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    32(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %ebx,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       paddd   %xmm7,%xmm2
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       psrld   $7,%xmm6
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       pshufd  $250,%xmm1,%xmm7
-       movl    %r8d,%r12d
-       pslld   $14,%xmm5
-       xorl    %edx,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       pslld   $11,%xmm5
-       addl    36(%rsp),%r10d
-       pxor    %xmm6,%xmm4
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       paddd   %xmm4,%xmm2
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    40(%rsp),%r9d
-.byte  102,65,15,56,0,248
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm2
-       addl    %r12d,%r9d
-       pshufd  $80,%xmm2,%xmm7
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       psrld   $10,%xmm7
-       movl    %ebx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       pxor    %xmm6,%xmm7
-       addl    44(%rsp),%r8d
-       movl    %r9d,%edi
-       movdqa  64(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       paddd   %xmm7,%xmm2
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       paddd   %xmm2,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       movdqa  %xmm6,32(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movdqa  %xmm0,%xmm4
-       movl    %ebx,%r12d
-       movdqa  %xmm2,%xmm7
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-.byte  102,15,58,15,227,4
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-.byte  102,15,58,15,249,4
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    48(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %r9d,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       paddd   %xmm7,%xmm3
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       psrld   $7,%xmm6
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       pshufd  $250,%xmm2,%xmm7
-       movl    %eax,%r12d
-       pslld   $14,%xmm5
-       xorl    %r11d,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       pslld   $11,%xmm5
-       addl    52(%rsp),%ecx
-       pxor    %xmm6,%xmm4
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       paddd   %xmm4,%xmm3
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    56(%rsp),%ebx
-.byte  102,65,15,56,0,248
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm3
-       addl    %r12d,%ebx
-       pshufd  $80,%xmm3,%xmm7
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       psrld   $10,%xmm7
-       movl    %r9d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       pxor    %xmm6,%xmm7
-       addl    60(%rsp),%eax
-       movl    %ebx,%edi
-       movdqa  96(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       paddd   %xmm7,%xmm3
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       paddd   %xmm3,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movdqa  %xmm6,48(%rsp)
-       cmpb    $0,131(%rbp)
-       jne     .Lssse3_00_47
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movl    %r9d,%r12d
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    0(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       xorl    %ebx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       movl    %r8d,%r12d
-       xorl    %edx,%r13d
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       addl    4(%rsp),%r10d
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    8(%rsp),%r9d
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r9d
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       movl    %ebx,%r13d
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       addl    12(%rsp),%r8d
-       movl    %r9d,%edi
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movl    %ebx,%r12d
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    16(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       xorl    %r9d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       movl    %eax,%r12d
-       xorl    %r11d,%r13d
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       addl    20(%rsp),%ecx
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    24(%rsp),%ebx
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%ebx
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       movl    %r9d,%r13d
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       addl    28(%rsp),%eax
-       movl    %ebx,%edi
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movl    %r9d,%r12d
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    32(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       xorl    %ebx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       movl    %r8d,%r12d
-       xorl    %edx,%r13d
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       addl    36(%rsp),%r10d
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    40(%rsp),%r9d
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r9d
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       movl    %ebx,%r13d
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       addl    44(%rsp),%r8d
-       movl    %r9d,%edi
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movl    %ebx,%r12d
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    48(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       xorl    %r9d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       movl    %eax,%r12d
-       xorl    %r11d,%r13d
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       addl    52(%rsp),%ecx
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    56(%rsp),%ebx
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%ebx
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       movl    %r9d,%r13d
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       addl    60(%rsp),%eax
-       movl    %ebx,%edi
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movq    64+0(%rsp),%rdi
-       movl    %r14d,%eax
-
-       addl    0(%rdi),%eax
-       leaq    64(%rsi),%rsi
-       addl    4(%rdi),%ebx
-       addl    8(%rdi),%ecx
-       addl    12(%rdi),%edx
-       addl    16(%rdi),%r8d
-       addl    20(%rdi),%r9d
-       addl    24(%rdi),%r10d
-       addl    28(%rdi),%r11d
-
-       cmpq    64+16(%rsp),%rsi
-
-       movl    %eax,0(%rdi)
-       movl    %ebx,4(%rdi)
-       movl    %ecx,8(%rdi)
-       movl    %edx,12(%rdi)
-       movl    %r8d,16(%rdi)
-       movl    %r9d,20(%rdi)
-       movl    %r10d,24(%rdi)
-       movl    %r11d,28(%rdi)
-       jb      .Lloop_ssse3
-
-       movq    64+24(%rsp),%rsi
-       movaps  64+32(%rsp),%xmm6
-       movaps  64+48(%rsp),%xmm7
-       movaps  64+64(%rsp),%xmm8
-       movaps  64+80(%rsp),%xmm9
-       movq    (%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
-.Lepilogue_ssse3:
+.Lxop_00_47:
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       vpaddq  %xmm11,%xmm0,%xmm0
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,223,3
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm7,%xmm10
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %rdx,%r13
+       addq    %r11,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r11
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpaddq  %xmm11,%xmm0,%xmm0
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       vpaddq  %xmm11,%xmm1,%xmm1
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,216,3
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm0,%xmm10
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rbx,%r13
+       addq    %r9,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r9
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpaddq  %xmm11,%xmm1,%xmm1
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       vpaddq  %xmm11,%xmm2,%xmm2
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,217,3
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm1,%xmm10
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %r11,%r13
+       addq    %rdx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpaddq  %xmm11,%xmm2,%xmm2
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       vpaddq  %xmm11,%xmm3,%xmm3
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,218,3
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm2,%xmm10
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r9,%r13
+       addq    %rbx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpaddq  %xmm11,%xmm3,%xmm3
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       vpaddq  %xmm11,%xmm4,%xmm4
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,219,3
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm3,%xmm10
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %rdx,%r13
+       addq    %r11,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r11
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpaddq  %xmm11,%xmm4,%xmm4
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       vpaddq  %xmm11,%xmm5,%xmm5
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,220,3
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm4,%xmm10
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rbx,%r13
+       addq    %r9,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r9
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpaddq  %xmm11,%xmm5,%xmm5
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       vpaddq  %xmm11,%xmm6,%xmm6
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,221,3
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm5,%xmm10
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %r11,%r13
+       addq    %rdx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpaddq  %xmm11,%xmm6,%xmm6
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       vpaddq  %xmm11,%xmm7,%xmm7
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,222,3
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm6,%xmm10
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r9,%r13
+       addq    %rbx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpaddq  %xmm11,%xmm7,%xmm7
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     .Lxop_00_47
+       rorq    $23,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       rorq    $5,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       rorq    $23,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       rorq    $5,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       rorq    $23,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       rorq    $23,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       rorq    $5,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       rorq    $5,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       rorq    $5,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       rorq    $23,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       rorq    $5,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       rorq    $23,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       rorq    $23,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       rorq    $5,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       rorq    $5,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      .Lloop_xop
+
+       movq    152(%rsp),%rsi
+
+       vzeroupper
+       movaps  128+32(%rsp),%xmm6
+       movaps  128+48(%rsp),%xmm7
+       movaps  128+64(%rsp),%xmm8
+       movaps  128+80(%rsp),%xmm9
+       movaps  128+96(%rsp),%xmm10
+       movaps  128+112(%rsp),%xmm11
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+.Lepilogue_xop:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_sha512_block_data_order_xop:
+.def   sha512_block_data_order_avx;    .scl 3; .type 32;       .endef
+.p2align       6
+sha512_block_data_order_avx:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha512_block_data_order_avx:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+
+.Lavx_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx
+       subq    $256,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
+       movaps  %xmm6,128+32(%rsp)
+       movaps  %xmm7,128+48(%rsp)
+       movaps  %xmm8,128+64(%rsp)
+       movaps  %xmm9,128+80(%rsp)
+       movaps  %xmm10,128+96(%rsp)
+       movaps  %xmm11,128+112(%rsp)
+.Lprologue_avx:
+
+       vzeroupper
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Lloop_avx
+.p2align       4
+.Lloop_avx:
+       vmovdqa K512+1280(%rip),%xmm11
+       vmovdqu 0(%rsi),%xmm0
+       leaq    K512+128(%rip),%rbp
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9
+       vpshufb %xmm11,%xmm7,%xmm7
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11
+       vmovdqa %xmm8,0(%rsp)
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %xmm11,112(%rsp)
+       movq    %r8,%r13
+       jmp     .Lavx_00_47
+
+.p2align       4
+.Lavx_00_47:
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm0,%xmm0
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm7,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm7,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm7,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm0,%xmm0
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm1,%xmm1
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm0,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm0,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm0,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm1,%xmm1
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm2,%xmm2
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm1,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm1,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm1,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm2,%xmm2
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm3,%xmm3
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm2,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm2,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm2,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm3,%xmm3
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm4,%xmm4
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm3,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm3,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm3,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm4,%xmm4
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm5,%xmm5
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm4,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm4,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm4,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm5,%xmm5
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm6,%xmm6
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm5,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm5,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm5,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm6,%xmm6
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm7,%xmm7
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm6,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm6,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm6,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm7,%xmm7
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     .Lavx_00_47
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      .Lloop_avx
+
+       movq    152(%rsp),%rsi
+
+       vzeroupper
+       movaps  128+32(%rsp),%xmm6
+       movaps  128+48(%rsp),%xmm7
+       movaps  128+64(%rsp),%xmm8
+       movaps  128+80(%rsp),%xmm9
+       movaps  128+96(%rsp),%xmm10
+       movaps  128+112(%rsp),%xmm11
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+.Lepilogue_avx:
+       movq    8(%rsp),%rdi
+       movq    16(%rsp),%rsi
+       .byte   0xf3,0xc3
+
+.LSEH_end_sha512_block_data_order_avx:
+.def   sha512_block_data_order_avx2;   .scl 3; .type 32;       .endef
+.p2align       6
+sha512_block_data_order_avx2:
+       movq    %rdi,8(%rsp)
+       movq    %rsi,16(%rsp)
+       movq    %rsp,%rax
+.LSEH_begin_sha512_block_data_order_avx2:
+       movq    %rcx,%rdi
+       movq    %rdx,%rsi
+       movq    %r8,%rdx
+
+
+.Lavx2_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       subq    $1408,%rsp
+       shlq    $4,%rdx
+       andq    $-2048,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       addq    $1152,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
+       movaps  %xmm6,128+32(%rsp)
+       movaps  %xmm7,128+48(%rsp)
+       movaps  %xmm8,128+64(%rsp)
+       movaps  %xmm9,128+80(%rsp)
+       movaps  %xmm10,128+96(%rsp)
+       movaps  %xmm11,128+112(%rsp)
+.Lprologue_avx2:
+
+       vzeroupper
+       subq    $-128,%rsi
+       movq    0(%rdi),%rax
+       movq    %rsi,%r12
+       movq    8(%rdi),%rbx
+       cmpq    %rdx,%rsi
+       movq    16(%rdi),%rcx
+       cmoveq  %rsp,%r12
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Loop_avx2
+.p2align       4
+.Loop_avx2:
+       vmovdqu -128(%rsi),%xmm0
+       vmovdqu -128+16(%rsi),%xmm1
+       vmovdqu -128+32(%rsi),%xmm2
+       leaq    K512+128(%rip),%rbp
+       vmovdqu -128+48(%rsi),%xmm3
+       vmovdqu -128+64(%rsi),%xmm4
+       vmovdqu -128+80(%rsi),%xmm5
+       vmovdqu -128+96(%rsi),%xmm6
+       vmovdqu -128+112(%rsi),%xmm7
+
+       vmovdqa 1152(%rbp),%ymm10
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm10,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm10,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+       vpshufb %ymm10,%ymm2,%ymm2
+       vinserti128     $1,64(%r12),%ymm4,%ymm4
+       vpshufb %ymm10,%ymm3,%ymm3
+       vinserti128     $1,80(%r12),%ymm5,%ymm5
+       vpshufb %ymm10,%ymm4,%ymm4
+       vinserti128     $1,96(%r12),%ymm6,%ymm6
+       vpshufb %ymm10,%ymm5,%ymm5
+       vinserti128     $1,112(%r12),%ymm7,%ymm7
+
+       vpaddq  -128(%rbp),%ymm0,%ymm8
+       vpshufb %ymm10,%ymm6,%ymm6
+       vpaddq  -96(%rbp),%ymm1,%ymm9
+       vpshufb %ymm10,%ymm7,%ymm7
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       vpaddq  -32(%rbp),%ymm3,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       vpaddq  0(%rbp),%ymm4,%ymm8
+       vmovdqa %ymm9,32(%rsp)
+       vpaddq  32(%rbp),%ymm5,%ymm9
+       vmovdqa %ymm10,64(%rsp)
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       vmovdqa %ymm11,96(%rsp)
+       leaq    -128(%rsp),%rsp
+       vpaddq  96(%rbp),%ymm7,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       xorq    %r14,%r14
+       vmovdqa %ymm9,32(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %ymm10,64(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %ymm11,96(%rsp)
+       movq    %r9,%r12
+       addq    $32*8,%rbp
+       jmp     .Lavx2_00_47
+
+.p2align       4
+.Lavx2_00_47:
+       leaq    -128(%rsp),%rsp
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm4,%ymm5,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm0,%ymm0
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm7,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm7,%ymm10
+       vpaddq  %ymm8,%ymm0,%ymm0
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm7,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm0,%ymm0
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  -128(%rbp),%ymm0,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm5,%ymm6,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm1,%ymm1
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm0,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm0,%ymm10
+       vpaddq  %ymm8,%ymm1,%ymm1
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm0,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm1,%ymm1
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  -96(%rbp),%ymm1,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm6,%ymm7,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm2,%ymm2
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm1,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm1,%ymm10
+       vpaddq  %ymm8,%ymm2,%ymm2
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm1,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm2,%ymm2
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm7,%ymm0,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm3,%ymm3
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm2,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm2,%ymm10
+       vpaddq  %ymm8,%ymm3,%ymm3
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm2,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm3,%ymm3
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  -32(%rbp),%ymm3,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    -128(%rsp),%rsp
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm0,%ymm1,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm4,%ymm4
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm3,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm3,%ymm10
+       vpaddq  %ymm8,%ymm4,%ymm4
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm3,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm4,%ymm4
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  0(%rbp),%ymm4,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm1,%ymm2,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm5,%ymm5
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm4,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm4,%ymm10
+       vpaddq  %ymm8,%ymm5,%ymm5
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm4,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm5,%ymm5
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  32(%rbp),%ymm5,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm2,%ymm3,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm6,%ymm6
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm5,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm5,%ymm10
+       vpaddq  %ymm8,%ymm6,%ymm6
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm5,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm6,%ymm6
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm3,%ymm4,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm7,%ymm7
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm6,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm6,%ymm10
+       vpaddq  %ymm8,%ymm7,%ymm7
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm6,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm7,%ymm7
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  96(%rbp),%ymm7,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    256(%rbp),%rbp
+       cmpb    $0,-121(%rbp)
+       jne     .Lavx2_00_47
+       addq    0+128(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+128(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+128(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+128(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+128(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+128(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+128(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+128(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       addq    0(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rbp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       cmpq    144(%rbp),%rsi
+       je      .Ldone_avx2
+
+       xorq    %r14,%r14
+       movq    %rbx,%rdi
+       xorq    %rcx,%rdi
+       movq    %r9,%r12
+       jmp     .Lower_avx2
+.p2align       4
+.Lower_avx2:
+       addq    0+16(%rbp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+16(%rbp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+16(%rbp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+16(%rbp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+16(%rbp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+16(%rbp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+16(%rbp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+16(%rbp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       leaq    -128(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rsp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       leaq    256(%rsi),%rsi
+       addq    48(%rdi),%r10
+       movq    %rsi,%r12
+       addq    56(%rdi),%r11
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+.Ldone_avx2:
+       leaq    (%rbp),%rsp
+       movq    152(%rsp),%rsi
+
+       vzeroupper
+       movaps  128+32(%rsp),%xmm6
+       movaps  128+48(%rsp),%xmm7
+       movaps  128+64(%rsp),%xmm8
+       movaps  128+80(%rsp),%xmm9
+       movaps  128+96(%rsp),%xmm10
+       movaps  128+112(%rsp),%xmm11
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+.Lepilogue_avx2:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
-.LSEH_end_sha256_block_data_order_ssse3:
+
+.LSEH_end_sha512_block_data_order_avx2:
 
 .def   se_handler;     .scl 3; .type 32;       .endef
 .p2align       4
@@ -2946,9 +5582,15 @@ se_handler:
        leaq    (%rsi,%r10,1),%r10
        cmpq    %r10,%rbx
        jae     .Lin_prologue
+       leaq    .Lavx2_shortcut(%rip),%r10
+       cmpq    %r10,%rbx
+       jb      .Lnot_in_avx2
+
+       andq    $-2048,%rax
+       addq    $1152,%rax
+.Lnot_in_avx2:
        movq    %rax,%rsi
-       movq    64+24(%rax),%rax
-       leaq    48(%rax),%rax
+       movq    128+24(%rax),%rax
 
        movq    -8(%rax),%rbx
        movq    -16(%rax),%rbp
@@ -2965,12 +5607,12 @@ se_handler:
 
        leaq    .Lepilogue(%rip),%r10
        cmpq    %r10,%rbx
-       jb      .Lin_prologue           
+       jb      .Lin_prologue
 
-       leaq    64+32(%rsi),%rsi
+       leaq    128+32(%rsi),%rsi
        leaq    512(%r8),%rdi
-       movl    $8,%ecx
-.long  0xa548f3fc              
+       movl    $12,%ecx
+.long  0xa548f3fc
 
 .Lin_prologue:
        movq    8(%rax),%rdi
@@ -2982,7 +5624,7 @@ se_handler:
        movq    40(%r9),%rdi
        movq    %r8,%rsi
        movl    $154,%ecx
-.long  0xa548f3fc              
+.long  0xa548f3fc
 
        movq    %r9,%rsi
        xorq    %rcx,%rcx
@@ -3011,23 +5653,36 @@ se_handler:
        popq    %rsi
        .byte   0xf3,0xc3
 
-
 .section       .pdata                          # Win64 function table: one (begin, end, unwind-info) RVA triple per function
 .p2align       2                               # 4-byte alignment for the 32-bit RVA entries
-.rva   .LSEH_begin_sha256_block_data_order
-.rva   .LSEH_end_sha256_block_data_order
-.rva   .LSEH_info_sha256_block_data_order
-.rva   .LSEH_begin_sha256_block_data_order_ssse3
-.rva   .LSEH_end_sha256_block_data_order_ssse3
-.rva   .LSEH_info_sha256_block_data_order_ssse3
+.rva   .LSEH_begin_sha512_block_data_order     # first byte covered by this entry
+.rva   .LSEH_end_sha512_block_data_order       # end of the covered range
+.rva   .LSEH_info_sha512_block_data_order      # matching unwind record in .xdata below
+.rva   .LSEH_begin_sha512_block_data_order_xop
+.rva   .LSEH_end_sha512_block_data_order_xop
+.rva   .LSEH_info_sha512_block_data_order_xop
+.rva   .LSEH_begin_sha512_block_data_order_avx
+.rva   .LSEH_end_sha512_block_data_order_avx
+.rva   .LSEH_info_sha512_block_data_order_avx
+.rva   .LSEH_begin_sha512_block_data_order_avx2
+.rva   .LSEH_end_sha512_block_data_order_avx2
+.rva   .LSEH_info_sha512_block_data_order_avx2
 .section       .xdata                          # unwind records referenced by the .pdata triples above
 .p2align       3
-.LSEH_info_sha256_block_data_order:
+.LSEH_info_sha512_block_data_order:
+.byte  9,0,0,0                                 # header byte 9 = version 1 | (UNW_FLAG_EHANDLER << 3); prolog size, unwind-code count and frame register all 0
+.rva   se_handler                              # handler routine; the same one is named by every record in this table
+.rva   .Lprologue,.Lepilogue                   # handler data: label pair for this variant; presumably the range se_handler tests the faulting address against - confirm in se_handler
+.LSEH_info_sha512_block_data_order_xop:
+.byte  9,0,0,0
+.rva   se_handler
+.rva   .Lprologue_xop,.Lepilogue_xop
+.LSEH_info_sha512_block_data_order_avx:
 .byte  9,0,0,0
 .rva   se_handler
-.rva   .Lprologue,.Lepilogue                   
-.LSEH_info_sha256_block_data_order_ssse3:
+.rva   .Lprologue_avx,.Lepilogue_avx
+.LSEH_info_sha512_block_data_order_avx2:
 .byte  9,0,0,0
 .rva   se_handler
-.rva   .Lprologue_ssse3,.Lepilogue_ssse3       
+.rva   .Lprologue_avx2,.Lepilogue_avx2
 
index 2b677952d17464a94d6691877dd4b0e94244b07c..265e28a7efaa6dfd36b99ef08f85ff8908cf122c 100644 (file)
@@ -5,12 +5,11 @@
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
-## For details see https://shiftleft.org/papers/vector_aes/ and
-## https://crypto.stanford.edu/vpaes/.
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
 #
 # *** This file is auto-generated ***
 #
-.file  "vpaes-x86.s"
 .text
 .align 64
 .L_vpaes_consts:
index d086050e37079fd8752c7fce671cd479e680a19b..ea1216baf7e4801a7ca5374fcb76e71bb04a31c5 100644 (file)
@@ -5,8 +5,8 @@
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
-## For details see https://shiftleft.org/papers/vector_aes/ and
-## https://crypto.stanford.edu/vpaes/.
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
 #
 # *** This file is auto-generated ***
 #
@@ -30,6 +30,7 @@
 .type  _vpaes_encrypt_core,@function
 .align 16
 _vpaes_encrypt_core:
+.cfi_startproc 
        movq    %rdx,%r9
        movq    $16,%r11
        movl    240(%rdx),%eax
@@ -110,6 +111,7 @@ _vpaes_encrypt_core:
        pxor    %xmm4,%xmm0
 .byte  102,15,56,0,193
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_encrypt_core,.-_vpaes_encrypt_core
 
 
@@ -120,6 +122,7 @@ _vpaes_encrypt_core:
 .type  _vpaes_decrypt_core,@function
 .align 16
 _vpaes_decrypt_core:
+.cfi_startproc 
        movq    %rdx,%r9
        movl    240(%rdx),%eax
        movdqa  %xmm9,%xmm1
@@ -216,6 +219,7 @@ _vpaes_decrypt_core:
        pxor    %xmm4,%xmm0
 .byte  102,15,56,0,194
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_decrypt_core,.-_vpaes_decrypt_core
 
 
@@ -226,6 +230,7 @@ _vpaes_decrypt_core:
 .type  _vpaes_schedule_core,@function
 .align 16
 _vpaes_schedule_core:
+.cfi_startproc 
 
 
 
@@ -392,6 +397,7 @@ _vpaes_schedule_core:
        pxor    %xmm6,%xmm6
        pxor    %xmm7,%xmm7
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_schedule_core,.-_vpaes_schedule_core
 
 
@@ -411,6 +417,7 @@ _vpaes_schedule_core:
 .type  _vpaes_schedule_192_smear,@function
 .align 16
 _vpaes_schedule_192_smear:
+.cfi_startproc 
        pshufd  $0x80,%xmm6,%xmm1
        pshufd  $0xFE,%xmm7,%xmm0
        pxor    %xmm1,%xmm6
@@ -419,6 +426,7 @@ _vpaes_schedule_192_smear:
        movdqa  %xmm6,%xmm0
        movhlps %xmm1,%xmm6
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
 
 
@@ -442,6 +450,7 @@ _vpaes_schedule_192_smear:
 .type  _vpaes_schedule_round,@function
 .align 16
 _vpaes_schedule_round:
+.cfi_startproc 
 
        pxor    %xmm1,%xmm1
 .byte  102,65,15,58,15,200,15
@@ -495,6 +504,7 @@ _vpaes_schedule_low_round:
        pxor    %xmm7,%xmm0
        movdqa  %xmm0,%xmm7
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_schedule_round,.-_vpaes_schedule_round
 
 
@@ -509,6 +519,7 @@ _vpaes_schedule_low_round:
 .type  _vpaes_schedule_transform,@function
 .align 16
 _vpaes_schedule_transform:
+.cfi_startproc 
        movdqa  %xmm9,%xmm1
        pandn   %xmm0,%xmm1
        psrld   $4,%xmm1
@@ -519,6 +530,7 @@ _vpaes_schedule_transform:
 .byte  102,15,56,0,193
        pxor    %xmm2,%xmm0
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_schedule_transform,.-_vpaes_schedule_transform
 
 
@@ -547,6 +559,7 @@ _vpaes_schedule_transform:
 .type  _vpaes_schedule_mangle,@function
 .align 16
 _vpaes_schedule_mangle:
+.cfi_startproc 
        movdqa  %xmm0,%xmm4
        movdqa  .Lk_mc_forward(%rip),%xmm5
        testq   %rcx,%rcx
@@ -611,6 +624,7 @@ _vpaes_schedule_mangle:
        andq    $0x30,%r8
        movdqu  %xmm3,(%rdx)
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
 
 
@@ -620,6 +634,7 @@ _vpaes_schedule_mangle:
 .type  vpaes_set_encrypt_key,@function
 .align 16
 vpaes_set_encrypt_key:
+.cfi_startproc 
        movl    %esi,%eax
        shrl    $5,%eax
        addl    $5,%eax
@@ -630,12 +645,14 @@ vpaes_set_encrypt_key:
        call    _vpaes_schedule_core
        xorl    %eax,%eax
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
 
 .globl vpaes_set_decrypt_key
 .type  vpaes_set_decrypt_key,@function
 .align 16
 vpaes_set_decrypt_key:
+.cfi_startproc 
        movl    %esi,%eax
        shrl    $5,%eax
        addl    $5,%eax
@@ -651,33 +668,39 @@ vpaes_set_decrypt_key:
        call    _vpaes_schedule_core
        xorl    %eax,%eax
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
 
 .globl vpaes_encrypt                   # exported entry point
 .type  vpaes_encrypt,@function
 .align 16
 vpaes_encrypt:                         # one 16-byte block: load from (%rdi), run _vpaes_encrypt_core, store to (%rsi)
+.cfi_startproc 
        movdqu  (%rdi),%xmm0            # xmm0 = 16 input bytes; unaligned load, so (%rdi) needs no alignment
        call    _vpaes_preheat          # fills xmm10-xmm15 with constants addressed from .Lk_s0F (clobbers %r10)
        call    _vpaes_encrypt_core     # works on xmm0; the core reads %rdx and 240(%rdx) - presumably key schedule and round count, confirm
        movdqu  %xmm0,(%rsi)            # write the 16 result bytes; unaligned store
        .byte   0xf3,0xc3               # hand-encoded "rep ret" (0xf3 prefix, 0xc3 ret)
+.cfi_endproc   
 .size  vpaes_encrypt,.-vpaes_encrypt
 
 .globl vpaes_decrypt                   # exported entry point
 .type  vpaes_decrypt,@function
 .align 16
 vpaes_decrypt:                         # one 16-byte block: load from (%rdi), run _vpaes_decrypt_core, store to (%rsi)
+.cfi_startproc 
        movdqu  (%rdi),%xmm0            # xmm0 = 16 input bytes; unaligned load, so (%rdi) needs no alignment
        call    _vpaes_preheat          # fills xmm10-xmm15 with constants addressed from .Lk_s0F (clobbers %r10)
        call    _vpaes_decrypt_core     # works on xmm0; the core reads %rdx and 240(%rdx) - presumably key schedule and round count, confirm
        movdqu  %xmm0,(%rsi)            # write the 16 result bytes; unaligned store
        .byte   0xf3,0xc3               # hand-encoded "rep ret" (0xf3 prefix, 0xc3 ret)
+.cfi_endproc   
 .size  vpaes_decrypt,.-vpaes_decrypt
 .globl vpaes_cbc_encrypt
 .type  vpaes_cbc_encrypt,@function
 .align 16
 vpaes_cbc_encrypt:
+.cfi_startproc 
        xchgq   %rcx,%rdx
        subq    $16,%rcx
        jc      .Lcbc_abort
@@ -713,6 +736,7 @@ vpaes_cbc_encrypt:
        movdqu  %xmm6,(%r8)
 .Lcbc_abort:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
 
 
@@ -723,6 +747,7 @@ vpaes_cbc_encrypt:
 .type  _vpaes_preheat,@function
 .align 16
 _vpaes_preheat:
+.cfi_startproc 
        leaq    .Lk_s0F(%rip),%r10
        movdqa  -32(%r10),%xmm10
        movdqa  -16(%r10),%xmm11
@@ -732,6 +757,7 @@ _vpaes_preheat:
        movdqa  80(%r10),%xmm15
        movdqa  96(%r10),%xmm14
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _vpaes_preheat,.-_vpaes_preheat
 
 
index 07f177d8d4f6844f8a84192f1ab6a1debae1d1e9..e26d18d69fa1bbeb24b1503d0c3e8127a32f49a4 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -354,17 +354,25 @@ _aesni_ctr32_ghash_6x:
 .type  aesni_gcm_decrypt,@function
 .align 32
 aesni_gcm_decrypt:
+.cfi_startproc 
        xorq    %r10,%r10
        cmpq    $0x60,%rdx
        jb      .Lgcm_dec_abort
 
        leaq    (%rsp),%rax
+.cfi_def_cfa_register  %rax
        pushq   %rbx
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_offset    %r12,-32
        pushq   %r13
+.cfi_offset    %r13,-40
        pushq   %r14
+.cfi_offset    %r14,-48
        pushq   %r15
+.cfi_offset    %r15,-56
        vzeroupper
 
        vmovdqu (%r8),%xmm1
@@ -426,15 +434,23 @@ aesni_gcm_decrypt:
 
        vzeroupper
        movq    -48(%rax),%r15
+.cfi_restore   %r15
        movq    -40(%rax),%r14
+.cfi_restore   %r14
        movq    -32(%rax),%r13
+.cfi_restore   %r13
        movq    -24(%rax),%r12
+.cfi_restore   %r12
        movq    -16(%rax),%rbp
+.cfi_restore   %rbp
        movq    -8(%rax),%rbx
+.cfi_restore   %rbx
        leaq    (%rax),%rsp
+.cfi_def_cfa_register  %rsp
 .Lgcm_dec_abort:
        movq    %r10,%rax
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_gcm_decrypt,.-aesni_gcm_decrypt
 .type  _aesni_ctr32_6x,@function
 .align 32
@@ -531,17 +547,25 @@ _aesni_ctr32_6x:
 .type  aesni_gcm_encrypt,@function
 .align 32
 aesni_gcm_encrypt:
+.cfi_startproc 
        xorq    %r10,%r10
        cmpq    $288,%rdx
        jb      .Lgcm_enc_abort
 
        leaq    (%rsp),%rax
+.cfi_def_cfa_register  %rax
        pushq   %rbx
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_offset    %r12,-32
        pushq   %r13
+.cfi_offset    %r13,-40
        pushq   %r14
+.cfi_offset    %r14,-48
        pushq   %r15
+.cfi_offset    %r15,-56
        vzeroupper
 
        vmovdqu (%r8),%xmm1
@@ -767,15 +791,23 @@ aesni_gcm_encrypt:
 
        vzeroupper
        movq    -48(%rax),%r15
+.cfi_restore   %r15
        movq    -40(%rax),%r14
+.cfi_restore   %r14
        movq    -32(%rax),%r13
+.cfi_restore   %r13
        movq    -24(%rax),%r12
+.cfi_restore   %r12
        movq    -16(%rax),%rbp
+.cfi_restore   %rbp
        movq    -8(%rax),%rbx
+.cfi_restore   %rbx
        leaq    (%rax),%rsp
+.cfi_def_cfa_register  %rsp
 .Lgcm_enc_abort:
        movq    %r10,%rax
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_gcm_encrypt,.-aesni_gcm_encrypt
 .align 64
 .Lbswap_mask:
index 5d70f2568f309b5eea0af9fe31b788703627f08f..aaf0bab6350344bba9d4b733187fdff5ffc3113a 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "devel/perlasm/aesni-x86.s"
 .text
 .globl aesni_encrypt
 .type  aesni_encrypt,@function
@@ -60,7 +59,10 @@ aesni_encrypt:
        leal    16(%edx),%edx
        jnz     .L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .size  aesni_encrypt,.-.L_aesni_encrypt_begin
 .globl aesni_decrypt
@@ -84,32 +86,90 @@ aesni_decrypt:
        leal    16(%edx),%edx
        jnz     .L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .size  aesni_decrypt,.-.L_aesni_decrypt_begin
+.type  _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+       movups  (%edx),%xmm0
+       shll    $4,%ecx
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L002enc2_loop:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L002enc2_loop
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,221,208
+.byte  102,15,56,221,216
+       ret
+.size  _aesni_encrypt2,.-_aesni_encrypt2
+.type  _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+       movups  (%edx),%xmm0
+       shll    $4,%ecx
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L003dec2_loop:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L003dec2_loop
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,223,208
+.byte  102,15,56,223,216
+       ret
+.size  _aesni_decrypt2,.-_aesni_decrypt2
 .type  _aesni_encrypt3,@function
 .align 16
 _aesni_encrypt3:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
-       movups  (%edx),%xmm0
-.L002enc3_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L004enc3_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
-       movups  (%edx),%xmm0
-       jnz     .L002enc3_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L004enc3_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -122,25 +182,26 @@ _aesni_encrypt3:
 .align 16
 _aesni_decrypt3:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
-       movups  (%edx),%xmm0
-.L003dec3_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+.L005dec3_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
-       movups  (%edx),%xmm0
-       jnz     .L003dec3_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L005dec3_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -154,27 +215,29 @@ _aesni_decrypt3:
 _aesni_encrypt4:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       shrl    $1,%ecx
-       leal    32(%edx),%edx
+       shll    $4,%ecx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
        pxor    %xmm0,%xmm5
-       movups  (%edx),%xmm0
-.L004enc4_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  15,31,64,0
+       addl    $16,%ecx
+.L006enc4_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
-       movups  (%edx),%xmm0
-       jnz     .L004enc4_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L006enc4_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -190,27 +253,29 @@ _aesni_encrypt4:
 _aesni_decrypt4:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       shrl    $1,%ecx
-       leal    32(%edx),%edx
+       shll    $4,%ecx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
        pxor    %xmm0,%xmm5
-       movups  (%edx),%xmm0
-.L005dec4_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  15,31,64,0
+       addl    $16,%ecx
+.L007dec4_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
-       movups  (%edx),%xmm0
-       jnz     .L005dec4_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L007dec4_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -225,45 +290,42 @@ _aesni_decrypt4:
 .align 16
 _aesni_encrypt6:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm5
-       decl    %ecx
-.byte  102,15,56,220,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,217
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,241
-       movups  (%edx),%xmm0
-.byte  102,15,56,220,249
-       jmp     .L_aesni_encrypt6_enter
+       movups  (%edx,%ecx,1),%xmm0
+       addl    $16,%ecx
+       jmp     .L008_aesni_encrypt6_inner
 .align 16
-.L006enc6_loop:
+.L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
+.L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.align 16
 .L_aesni_encrypt6_enter:
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  (%edx),%xmm0
-       jnz     .L006enc6_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -282,45 +344,42 @@ _aesni_encrypt6:
 .align 16
 _aesni_decrypt6:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,222,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,222,217
+.byte  102,15,56,222,209
        pxor    %xmm0,%xmm5
-       decl    %ecx
-.byte  102,15,56,222,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,217
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,241
-       movups  (%edx),%xmm0
-.byte  102,15,56,222,249
-       jmp     .L_aesni_decrypt6_enter
+       movups  (%edx,%ecx,1),%xmm0
+       addl    $16,%ecx
+       jmp     .L010_aesni_decrypt6_inner
 .align 16
-.L007dec6_loop:
+.L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
+.L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.align 16
 .L_aesni_decrypt6_enter:
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  (%edx),%xmm0
-       jnz     .L007dec6_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -350,14 +409,14 @@ aesni_ecb_encrypt:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      .L008ecb_ret
+       jz      .L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      .L009ecb_decrypt
+       jz      .L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L010ecb_enc_tail
+       jb      .L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -366,9 +425,9 @@ aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L011ecb_enc_loop6_enter
+       jmp     .L015ecb_enc_loop6_enter
 .align 16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -383,12 +442,12 @@ aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
        call    _aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L012ecb_enc_loop6
+       jnc     .L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -397,18 +456,18 @@ aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L008ecb_ret
-.L010ecb_enc_tail:
+       jz      .L012ecb_ret
+.L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L013ecb_enc_one
+       jb      .L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      .L014ecb_enc_two
+       je      .L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L015ecb_enc_three
+       jb      .L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      .L016ecb_enc_four
+       je      .L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    _aesni_encrypt6
@@ -417,50 +476,49 @@ aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L017enc1_loop_3
+       jnz     .L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L014ecb_enc_two:
-       xorps   %xmm4,%xmm4
-       call    _aesni_encrypt3
+.L018ecb_enc_two:
+       call    _aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
        call    _aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
        call    _aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L018ecb_dec_tail
+       jb      .L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -469,9 +527,9 @@ aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L019ecb_dec_loop6_enter
+       jmp     .L023ecb_dec_loop6_enter
 .align 16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -486,12 +544,12 @@ aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
        call    _aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L020ecb_dec_loop6
+       jnc     .L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -500,18 +558,18 @@ aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L008ecb_ret
-.L018ecb_dec_tail:
+       jz      .L012ecb_ret
+.L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L021ecb_dec_one
+       jb      .L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      .L022ecb_dec_two
+       je      .L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L023ecb_dec_three
+       jb      .L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      .L024ecb_dec_four
+       je      .L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
@@ -520,44 +578,51 @@ aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L025dec1_loop_4
+       jnz     .L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L022ecb_dec_two:
-       xorps   %xmm4,%xmm4
-       call    _aesni_decrypt3
+.L026ecb_dec_two:
+       call    _aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L008ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -596,48 +661,56 @@ aesni_ccm64_encrypt_blocks:
        movl    %ebp,20(%esp)
        movl    %ebp,24(%esp)
        movl    %ebp,28(%esp)
-       shrl    $1,%ecx
+       shll    $4,%ecx
+       movl    $16,%ebx
        leal    (%edx),%ebp
        movdqa  (%esp),%xmm5
        movdqa  %xmm7,%xmm2
-       movl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       subl    %ecx,%ebx
 .byte  102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
        xorps   %xmm0,%xmm2
        movups  16(%ebp),%xmm1
        xorps   %xmm6,%xmm0
-       leal    32(%ebp),%edx
        xorps   %xmm0,%xmm3
-       movups  (%edx),%xmm0
-.L027ccm64_enc2_loop:
+       movups  32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
-       decl    %ecx
 .byte  102,15,56,220,217
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
-       leal    32(%edx),%edx
 .byte  102,15,56,220,216
-       movups  (%edx),%xmm0
-       jnz     .L027ccm64_enc2_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
+       decl    %eax
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
-       decl    %eax
        leal    16(%esi),%esi
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
-       leal    16(%edi),%edi
 .byte  102,15,56,0,213
-       jnz     .L026ccm64_enc_outer
+       leal    16(%edi),%edi
+       jnz     .L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -685,71 +758,82 @@ aesni_ccm64_decrypt_blocks:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L028enc1_loop_5
+       jnz     .L032enc1_loop_5
 .byte  102,15,56,221,209
+       shll    $4,%ebx
+       movl    $16,%ecx
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
        leal    16(%esi),%esi
-       jmp     .L029ccm64_dec_outer
+       subl    %ebx,%ecx
+       leal    32(%ebp,%ebx,1),%edx
+       movl    %ecx,%ebx
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
-       movl    %ebx,%ecx
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      .L030ccm64_dec_break
+       jz      .L034ccm64_dec_break
        movups  (%ebp),%xmm0
-       shrl    $1,%ecx
+       movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
        xorps   %xmm0,%xmm6
-       leal    32(%ebp),%edx
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
-       movups  (%edx),%xmm0
-.L031ccm64_dec2_loop:
+       movups  32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
-       decl    %ecx
 .byte  102,15,56,220,217
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
-       leal    32(%edx),%edx
 .byte  102,15,56,220,216
-       movups  (%edx),%xmm0
-       jnz     .L031ccm64_dec2_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     .L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       leal    16(%esi),%esi
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
-       jmp     .L029ccm64_dec_outer
+       leal    16(%esi),%esi
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L032enc1_loop_6
+       jnz     .L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -775,7 +859,7 @@ aesni_ctr32_encrypt_blocks:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      .L033ctr32_one_shortcut
+       je      .L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -791,63 +875,59 @@ aesni_ctr32_encrypt_blocks:
 .byte  102,15,58,34,253,3
        movl    240(%edx),%ecx
        bswap   %ebx
-       pxor    %xmm1,%xmm1
        pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movdqa  (%esp),%xmm2
-.byte  102,15,58,34,203,0
+.byte  102,15,58,34,195,0
        leal    3(%ebx),%ebp
-.byte  102,15,58,34,197,0
+.byte  102,15,58,34,205,0
        incl    %ebx
-.byte  102,15,58,34,203,1
+.byte  102,15,58,34,195,1
        incl    %ebp
-.byte  102,15,58,34,197,1
+.byte  102,15,58,34,205,1
        incl    %ebx
-.byte  102,15,58,34,203,2
+.byte  102,15,58,34,195,2
        incl    %ebp
-.byte  102,15,58,34,197,2
-       movdqa  %xmm1,48(%esp)
-.byte  102,15,56,0,202
-       movdqa  %xmm0,64(%esp)
+.byte  102,15,58,34,205,2
+       movdqa  %xmm0,48(%esp)
 .byte  102,15,56,0,194
-       pshufd  $192,%xmm1,%xmm2
-       pshufd  $128,%xmm1,%xmm3
+       movdqu  (%edx),%xmm6
+       movdqa  %xmm1,64(%esp)
+.byte  102,15,56,0,202
+       pshufd  $192,%xmm0,%xmm2
+       pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      .L034ctr32_tail
+       jb      .L038ctr32_tail
+       pxor    %xmm6,%xmm7
+       shll    $4,%ecx
+       movl    $16,%ebx
        movdqa  %xmm7,32(%esp)
-       shrl    $1,%ecx
        movl    %edx,%ebp
-       movl    %ecx,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
-       pshufd  $64,%xmm1,%xmm4
-       movdqa  32(%esp),%xmm1
-       pshufd  $192,%xmm0,%xmm5
-       por     %xmm1,%xmm2
-       pshufd  $128,%xmm0,%xmm6
-       por     %xmm1,%xmm3
-       pshufd  $64,%xmm0,%xmm7
-       por     %xmm1,%xmm4
-       por     %xmm1,%xmm5
-       por     %xmm1,%xmm6
-       por     %xmm1,%xmm7
-       movups  (%ebp),%xmm0
-       movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
-       decl    %ecx
+       jmp     .L039ctr32_loop6
+.align 16
+.L039ctr32_loop6:
+       pshufd  $64,%xmm0,%xmm4
+       movdqa  32(%esp),%xmm0
+       pshufd  $192,%xmm1,%xmm5
        pxor    %xmm0,%xmm2
+       pshufd  $128,%xmm1,%xmm6
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
+       pshufd  $64,%xmm1,%xmm7
+       movups  16(%ebp),%xmm1
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,220,225
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
        pxor    %xmm0,%xmm7
+.byte  102,15,56,220,217
+       movups  32(%ebp),%xmm0
+       movl    %ebx,%ecx
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
 .byte  102,15,56,220,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,220,249
        call    .L_aesni_encrypt6_enter
        movups  (%esi),%xmm1
@@ -858,51 +938,51 @@ aesni_ctr32_encrypt_blocks:
        movups  %xmm2,(%edi)
        movdqa  16(%esp),%xmm0
        xorps   %xmm1,%xmm4
-       movdqa  48(%esp),%xmm1
+       movdqa  64(%esp),%xmm1
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        paddd   %xmm0,%xmm1
-       paddd   64(%esp),%xmm0
+       paddd   48(%esp),%xmm0
        movdqa  (%esp),%xmm2
        movups  48(%esi),%xmm3
        movups  64(%esi),%xmm4
        xorps   %xmm3,%xmm5
        movups  80(%esi),%xmm3
        leal    96(%esi),%esi
-       movdqa  %xmm1,48(%esp)
-.byte  102,15,56,0,202
+       movdqa  %xmm0,48(%esp)
+.byte  102,15,56,0,194
        xorps   %xmm4,%xmm6
        movups  %xmm5,48(%edi)
        xorps   %xmm3,%xmm7
-       movdqa  %xmm0,64(%esp)
-.byte  102,15,56,0,194
+       movdqa  %xmm1,64(%esp)
+.byte  102,15,56,0,202
        movups  %xmm6,64(%edi)
-       pshufd  $192,%xmm1,%xmm2
+       pshufd  $192,%xmm0,%xmm2
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
-       movl    %ebx,%ecx
-       pshufd  $128,%xmm1,%xmm3
+       pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     .L035ctr32_loop6
+       jnc     .L039ctr32_loop6
        addl    $6,%eax
-       jz      .L036ctr32_ret
+       jz      .L040ctr32_ret
+       movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
-       leal    1(,%ecx,2),%ecx
-       movdqa  32(%esp),%xmm7
-.L034ctr32_tail:
+       pxor    32(%esp),%xmm7
+       movl    240(%ebp),%ecx
+.L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      .L037ctr32_one
-       pshufd  $64,%xmm1,%xmm4
+       jb      .L041ctr32_one
+       pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      .L038ctr32_two
-       pshufd  $192,%xmm0,%xmm5
+       je      .L042ctr32_two
+       pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      .L039ctr32_three
-       pshufd  $128,%xmm0,%xmm6
+       jb      .L043ctr32_three
+       pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      .L040ctr32_four
+       je      .L044ctr32_four
        por     %xmm7,%xmm6
        call    _aesni_encrypt6
        movups  (%esi),%xmm1
@@ -920,39 +1000,39 @@ aesni_ctr32_encrypt_blocks:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L041enc1_loop_7
+       jnz     .L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L038ctr32_two:
-       call    _aesni_encrypt3
+.L042ctr32_two:
+       call    _aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L039ctr32_three:
+.L043ctr32_three:
        call    _aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -963,9 +1043,9 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L036ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L040ctr32_four:
+.L044ctr32_four:
        call    _aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -979,7 +1059,18 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1004,12 +1095,12 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L042enc1_loop_8
+       jnz     .L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1033,12 +1124,14 @@ aesni_xts_encrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      .L043xts_enc_short
-       shrl    $1,%ecx
-       movl    %ecx,%ebx
-       jmp     .L044xts_enc_loop6
+       jc      .L047xts_enc_short
+       shll    $4,%ecx
+       movl    $16,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       jmp     .L048xts_enc_loop6
 .align 16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1074,6 +1167,7 @@ aesni_xts_encrypt:
        pand    %xmm3,%xmm7
        movups  (%esi),%xmm2
        pxor    %xmm1,%xmm7
+       movl    %ebx,%ecx
        movdqu  16(%esi),%xmm3
        xorps   %xmm0,%xmm2
        movdqu  32(%esi),%xmm4
@@ -1089,19 +1183,17 @@ aesni_xts_encrypt:
        movdqa  %xmm7,80(%esp)
        pxor    %xmm1,%xmm7
        movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
        pxor    16(%esp),%xmm3
-.byte  102,15,56,220,209
        pxor    32(%esp),%xmm4
-.byte  102,15,56,220,217
+.byte  102,15,56,220,209
        pxor    48(%esp),%xmm5
-       decl    %ecx
-.byte  102,15,56,220,225
        pxor    64(%esp),%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm7
+       movups  32(%ebp),%xmm0
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
 .byte  102,15,56,220,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,220,249
        call    .L_aesni_encrypt6_enter
        movdqa  80(%esp),%xmm1
@@ -1126,26 +1218,25 @@ aesni_xts_encrypt:
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
-       movl    %ebx,%ecx
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L044xts_enc_loop6
-       leal    1(,%ecx,2),%ecx
+       jnc     .L048xts_enc_loop6
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
        addl    $96,%eax
-       jz      .L045xts_enc_done6x
+       jz      .L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L046xts_enc_one
+       jb      .L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L047xts_enc_two
+       je      .L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1154,7 +1245,7 @@ aesni_xts_encrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L048xts_enc_three
+       jb      .L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1164,7 +1255,7 @@ aesni_xts_encrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L049xts_enc_four
+       je      .L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1196,9 +1287,9 @@ aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L046xts_enc_one:
+.L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1206,37 +1297,36 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L051enc1_loop_9
+       jnz     .L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L047xts_enc_two:
+.L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
        leal    32(%esi),%esi
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
-       xorps   %xmm4,%xmm4
-       call    _aesni_encrypt3
+       call    _aesni_encrypt2
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L048xts_enc_three:
+.L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1254,9 +1344,9 @@ aesni_xts_encrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L049xts_enc_four:
+.L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1278,28 +1368,28 @@ aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L050xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L052xts_enc_ret
+       jz      .L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     .L053xts_enc_steal
+       jmp     .L057xts_enc_steal
 .align 16
-.L050xts_enc_done:
+.L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L052xts_enc_ret
+       jz      .L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1307,7 +1397,7 @@ aesni_xts_encrypt:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L053xts_enc_steal
+       jnz     .L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1317,16 +1407,30 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L054enc1_loop_10
+       jnz     .L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1351,12 +1455,12 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L055enc1_loop_11
+       jnz     .L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1385,12 +1489,14 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      .L056xts_dec_short
-       shrl    $1,%ecx
-       movl    %ecx,%ebx
-       jmp     .L057xts_dec_loop6
+       jc      .L060xts_dec_short
+       shll    $4,%ecx
+       movl    $16,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       jmp     .L061xts_dec_loop6
 .align 16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1426,6 +1532,7 @@ aesni_xts_decrypt:
        pand    %xmm3,%xmm7
        movups  (%esi),%xmm2
        pxor    %xmm1,%xmm7
+       movl    %ebx,%ecx
        movdqu  16(%esi),%xmm3
        xorps   %xmm0,%xmm2
        movdqu  32(%esi),%xmm4
@@ -1441,19 +1548,17 @@ aesni_xts_decrypt:
        movdqa  %xmm7,80(%esp)
        pxor    %xmm1,%xmm7
        movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
        pxor    16(%esp),%xmm3
-.byte  102,15,56,222,209
        pxor    32(%esp),%xmm4
-.byte  102,15,56,222,217
+.byte  102,15,56,222,209
        pxor    48(%esp),%xmm5
-       decl    %ecx
-.byte  102,15,56,222,225
        pxor    64(%esp),%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm7
+       movups  32(%ebp),%xmm0
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
 .byte  102,15,56,222,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,222,249
        call    .L_aesni_decrypt6_enter
        movdqa  80(%esp),%xmm1
@@ -1478,26 +1583,25 @@ aesni_xts_decrypt:
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
-       movl    %ebx,%ecx
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L057xts_dec_loop6
-       leal    1(,%ecx,2),%ecx
+       jnc     .L061xts_dec_loop6
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
        addl    $96,%eax
-       jz      .L058xts_dec_done6x
+       jz      .L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L059xts_dec_one
+       jb      .L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L060xts_dec_two
+       je      .L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1506,7 +1610,7 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L061xts_dec_three
+       jb      .L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1516,7 +1620,7 @@ aesni_xts_decrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L062xts_dec_four
+       je      .L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1548,9 +1652,9 @@ aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L059xts_dec_one:
+.L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1558,36 +1662,36 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L064dec1_loop_12
+       jnz     .L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L060xts_dec_two:
+.L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
        leal    32(%esi),%esi
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
-       call    _aesni_decrypt3
+       call    _aesni_decrypt2
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L061xts_dec_three:
+.L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1605,9 +1709,9 @@ aesni_xts_decrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L062xts_dec_four:
+.L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1629,20 +1733,20 @@ aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L063xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L065xts_dec_ret
+       jz      .L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     .L066xts_dec_only_one_more
+       jmp     .L070xts_dec_only_one_more
 .align 16
-.L063xts_dec_done:
+.L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L065xts_dec_ret
+       jz      .L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1652,7 +1756,7 @@ aesni_xts_decrypt:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1666,16 +1770,16 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L067dec1_loop_13
+       jnz     .L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1683,7 +1787,7 @@ aesni_xts_decrypt:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L068xts_dec_steal
+       jnz     .L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1693,16 +1797,30 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L069dec1_loop_14
+       jnz     .L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1710,89 +1828,880 @@ aesni_xts_decrypt:
        popl    %ebp
        ret
 .size  aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin
-.globl aesni_cbc_encrypt
-.type  aesni_cbc_encrypt,@function
+.globl aesni_ocb_encrypt
+.type  aesni_ocb_encrypt,@function
 .align 16
-aesni_cbc_encrypt:
-.L_aesni_cbc_encrypt_begin:
+aesni_ocb_encrypt:
+.L_aesni_ocb_encrypt_begin:
        pushl   %ebp
        pushl   %ebx
        pushl   %esi
        pushl   %edi
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
        movl    20(%esp),%esi
-       movl    %esp,%ebx
        movl    24(%esp),%edi
-       subl    $24,%ebx
        movl    28(%esp),%eax
-       andl    $-16,%ebx
        movl    32(%esp),%edx
+       movdqu  (%ecx),%xmm0
        movl    36(%esp),%ebp
-       testl   %eax,%eax
-       jz      .L070cbc_abort
-       cmpl    $0,40(%esp)
-       xchgl   %esp,%ebx
-       movups  (%ebp),%xmm7
+       movdqu  (%ebx),%xmm1
+       movl    44(%esp),%ebx
+       movl    %esp,%ecx
+       subl    $132,%esp
+       andl    $-16,%esp
+       subl    %esi,%edi
+       shll    $4,%eax
+       leal    -96(%esi,%eax,1),%eax
+       movl    %edi,120(%esp)
+       movl    %eax,124(%esp)
+       movl    %ecx,128(%esp)
        movl    240(%edx),%ecx
-       movl    %edx,%ebp
-       movl    %ebx,16(%esp)
-       movl    %ecx,%ebx
-       je      .L071cbc_decrypt
-       movaps  %xmm7,%xmm2
-       cmpl    $16,%eax
-       jb      .L072cbc_enc_tail
-       subl    $16,%eax
-       jmp     .L073cbc_enc_loop
-.align 16
-.L073cbc_enc_loop:
-       movups  (%esi),%xmm7
+       testl   $1,%ebp
+       jnz     .L074odd
+       bsfl    %ebp,%eax
+       addl    $1,%ebp
+       shll    $4,%eax
+       movdqu  (%ebx,%eax,1),%xmm7
+       movl    %edx,%eax
+       movdqu  (%esi),%xmm2
        leal    16(%esi),%esi
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
-       xorps   %xmm7,%xmm2
-.L074enc1_loop_15:
+       xorps   %xmm0,%xmm2
+.L075enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L074enc1_loop_15
+       jnz     .L075enc1_loop_15
 .byte  102,15,56,221,209
-       movl    %ebx,%ecx
-       movl    %ebp,%edx
-       movups  %xmm2,(%edi)
-       leal    16(%edi),%edi
-       subl    $16,%eax
-       jnc     .L073cbc_enc_loop
-       addl    $16,%eax
-       jnz     .L072cbc_enc_tail
-       movaps  %xmm2,%xmm7
-       jmp     .L075cbc_ret
-.L072cbc_enc_tail:
-       movl    %eax,%ecx
-.long  2767451785
-       movl    $16,%ecx
-       subl    %eax,%ecx
-       xorl    %eax,%eax
-.long  2868115081
-       leal    -16(%edi),%edi
-       movl    %ebx,%ecx
-       movl    %edi,%esi
-       movl    %ebp,%edx
-       jmp     .L073cbc_enc_loop
-.align 16
-.L071cbc_decrypt:
-       cmpl    $80,%eax
-       jbe     .L076cbc_dec_tail
-       movaps  %xmm7,(%esp)
-       subl    $80,%eax
-       jmp     .L077cbc_dec_loop6_enter
-.align 16
-.L078cbc_dec_loop6:
-       movaps  %xmm0,(%esp)
-       movups  %xmm7,(%edi)
+       xorps   %xmm7,%xmm2
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm6,%xmm1
+       movups  %xmm2,-16(%edi,%esi,1)
+       movl    240(%eax),%ecx
+       movl    %eax,%edx
+       movl    124(%esp),%eax
+.L074odd:
+       shll    $4,%ecx
+       movl    $16,%edi
+       subl    %ecx,%edi
+       movl    %edx,112(%esp)
+       leal    32(%edx,%ecx,1),%edx
+       movl    %edi,116(%esp)
+       cmpl    %eax,%esi
+       ja      .L076short
+       jmp     .L077grandloop
+.align 32
+.L077grandloop:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       leal    5(%ebp),%edi
+       addl    $6,%ebp
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       bsfl    %edi,%edi
+       shll    $4,%ecx
+       shll    $4,%eax
+       shll    $4,%edi
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       movdqu  (%ebx,%edi,1),%xmm7
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movdqa  %xmm7,80(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       movdqu  80(%esi),%xmm7
+       leal    96(%esi),%esi
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm6,%xmm1
+       pxor    %xmm0,%xmm6
+       pxor    %xmm7,%xmm1
+       pxor    %xmm0,%xmm7
+       movdqa  %xmm1,96(%esp)
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    80(%esp),%xmm7
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movl    120(%esp),%edi
+       movl    124(%esp),%eax
+       call    .L_aesni_encrypt6_enter
+       movdqa  80(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    %xmm0,%xmm7
+       movdqa  96(%esp),%xmm1
+       movdqu  %xmm2,-96(%edi,%esi,1)
+       movdqu  %xmm3,-80(%edi,%esi,1)
+       movdqu  %xmm4,-64(%edi,%esi,1)
+       movdqu  %xmm5,-48(%edi,%esi,1)
+       movdqu  %xmm6,-32(%edi,%esi,1)
+       movdqu  %xmm7,-16(%edi,%esi,1)
+       cmpl    %eax,%esi
+       jb      .L077grandloop
+.L076short:
+       addl    $96,%eax
+       subl    %esi,%eax
+       jz      .L078done
+       cmpl    $32,%eax
+       jb      .L079one
+       je      .L080two
+       cmpl    $64,%eax
+       jb      .L081three
+       je      .L082four
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm6,%xmm1
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm1,96(%esp)
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movl    120(%esp),%edi
+       call    .L_aesni_encrypt6_enter
+       movdqa  64(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    %xmm0,%xmm6
+       movdqa  96(%esp),%xmm1
+       movdqu  %xmm2,(%edi,%esi,1)
+       movdqu  %xmm3,16(%edi,%esi,1)
+       movdqu  %xmm4,32(%edi,%esi,1)
+       movdqu  %xmm5,48(%edi,%esi,1)
+       movdqu  %xmm6,64(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L079one:
+       movdqu  (%ebx),%xmm7
+       movl    112(%esp),%edx
+       movdqu  (%esi),%xmm2
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movl    120(%esp),%edi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+.L083enc1_loop_16:
+.byte  102,15,56,220,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L083enc1_loop_16
+.byte  102,15,56,221,209
+       xorps   %xmm7,%xmm2
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm6,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L080two:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm6
+       movdqu  (%ebx,%ecx,1),%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm7,%xmm3
+       movdqa  %xmm1,%xmm5
+       movl    120(%esp),%edi
+       call    _aesni_encrypt2
+       xorps   %xmm6,%xmm2
+       xorps   %xmm7,%xmm3
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm5,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       movups  %xmm3,16(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L081three:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm5
+       movdqu  (%ebx,%ecx,1),%xmm6
+       movdqa  %xmm5,%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm5,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm6,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm7,%xmm4
+       movdqa  %xmm1,96(%esp)
+       movl    120(%esp),%edi
+       call    _aesni_encrypt3
+       xorps   %xmm5,%xmm2
+       xorps   %xmm6,%xmm3
+       xorps   %xmm7,%xmm4
+       movdqa  %xmm7,%xmm0
+       movdqa  96(%esp),%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       movups  %xmm3,16(%edi,%esi,1)
+       movups  %xmm4,32(%edi,%esi,1)
+       jmp     .L078done
+.align 16
+.L082four:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       movl    112(%esp),%edx
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm4
+       movdqu  (%ebx,%ecx,1),%xmm5
+       movdqa  %xmm4,%xmm6
+       movdqu  (%ebx,%eax,1),%xmm7
+       pxor    %xmm0,%xmm4
+       movdqu  (%esi),%xmm2
+       pxor    %xmm4,%xmm5
+       movdqu  16(%esi),%xmm3
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm4,(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm5,16(%esp)
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movl    240(%edx),%ecx
+       pxor    %xmm2,%xmm1
+       pxor    (%esp),%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm7,%xmm5
+       movdqa  %xmm1,96(%esp)
+       movl    120(%esp),%edi
+       call    _aesni_encrypt4
+       xorps   (%esp),%xmm2
+       xorps   16(%esp),%xmm3
+       xorps   %xmm6,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       xorps   %xmm7,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       movdqa  %xmm7,%xmm0
+       movups  %xmm4,32(%edi,%esi,1)
+       movdqa  96(%esp),%xmm1
+       movups  %xmm5,48(%edi,%esi,1)
+.L078done:
+       movl    128(%esp),%edx
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm2,16(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm2,32(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm2,48(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm2,64(%esp)
+       movdqa  %xmm2,80(%esp)
+       movdqa  %xmm2,96(%esp)
+       leal    (%edx),%esp
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movdqu  %xmm0,(%ecx)
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm1,(%ebx)
+       pxor    %xmm1,%xmm1
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  aesni_ocb_encrypt,.-.L_aesni_ocb_encrypt_begin
+.globl aesni_ocb_decrypt
+.type  aesni_ocb_decrypt,@function
+.align 16
+aesni_ocb_decrypt:
+.L_aesni_ocb_decrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movl    20(%esp),%esi
+       movl    24(%esp),%edi
+       movl    28(%esp),%eax
+       movl    32(%esp),%edx
+       movdqu  (%ecx),%xmm0
+       movl    36(%esp),%ebp
+       movdqu  (%ebx),%xmm1
+       movl    44(%esp),%ebx
+       movl    %esp,%ecx
+       subl    $132,%esp
+       andl    $-16,%esp
+       subl    %esi,%edi
+       shll    $4,%eax
+       leal    -96(%esi,%eax,1),%eax
+       movl    %edi,120(%esp)
+       movl    %eax,124(%esp)
+       movl    %ecx,128(%esp)
+       movl    240(%edx),%ecx
+       testl   $1,%ebp
+       jnz     .L084odd
+       bsfl    %ebp,%eax
+       addl    $1,%ebp
+       shll    $4,%eax
+       movdqu  (%ebx,%eax,1),%xmm7
+       movl    %edx,%eax
+       movdqu  (%esi),%xmm2
+       leal    16(%esi),%esi
+       pxor    %xmm0,%xmm7
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+.L085dec1_loop_17:
+.byte  102,15,56,222,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L085dec1_loop_17
+.byte  102,15,56,223,209
+       xorps   %xmm7,%xmm2
+       movaps  %xmm6,%xmm1
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm1
+       movups  %xmm2,-16(%edi,%esi,1)
+       movl    240(%eax),%ecx
+       movl    %eax,%edx
+       movl    124(%esp),%eax
+.L084odd:
+       shll    $4,%ecx
+       movl    $16,%edi
+       subl    %ecx,%edi
+       movl    %edx,112(%esp)
+       leal    32(%edx,%ecx,1),%edx
+       movl    %edi,116(%esp)
+       cmpl    %eax,%esi
+       ja      .L086short
+       jmp     .L087grandloop
+.align 32
+.L087grandloop:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       leal    5(%ebp),%edi
+       addl    $6,%ebp
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       bsfl    %edi,%edi
+       shll    $4,%ecx
+       shll    $4,%eax
+       shll    $4,%edi
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       movdqu  (%ebx,%edi,1),%xmm7
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movdqa  %xmm7,80(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       movdqu  80(%esi),%xmm7
+       leal    96(%esi),%esi
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       pxor    %xmm0,%xmm4
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    80(%esp),%xmm7
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movl    120(%esp),%edi
+       movl    124(%esp),%eax
+       call    .L_aesni_decrypt6_enter
+       movdqa  80(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       movdqa  96(%esp),%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       movdqu  %xmm2,-96(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqu  %xmm3,-80(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movdqu  %xmm4,-64(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+       movdqu  %xmm5,-48(%edi,%esi,1)
+       pxor    %xmm6,%xmm1
+       movdqu  %xmm6,-32(%edi,%esi,1)
+       pxor    %xmm7,%xmm1
+       movdqu  %xmm7,-16(%edi,%esi,1)
+       cmpl    %eax,%esi
+       jb      .L087grandloop
+.L086short:
+       addl    $96,%eax
+       subl    %esi,%eax
+       jz      .L088done
+       cmpl    $32,%eax
+       jb      .L089one
+       je      .L090two
+       cmpl    $64,%eax
+       jb      .L091three
+       je      .L092four
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       pxor    %xmm0,%xmm4
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movl    120(%esp),%edi
+       call    .L_aesni_decrypt6_enter
+       movdqa  64(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       movdqa  96(%esp),%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm2,%xmm1
+       movdqu  %xmm2,(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqu  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movdqu  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+       movdqu  %xmm5,48(%edi,%esi,1)
+       pxor    %xmm6,%xmm1
+       movdqu  %xmm6,64(%edi,%esi,1)
+       jmp     .L088done
+.align 16
+.L089one:
+       movdqu  (%ebx),%xmm7
+       movl    112(%esp),%edx
+       movdqu  (%esi),%xmm2
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm7
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movl    120(%esp),%edi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+.L093dec1_loop_18:
+.byte  102,15,56,222,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L093dec1_loop_18
+.byte  102,15,56,223,209
+       xorps   %xmm7,%xmm2
+       movaps  %xmm6,%xmm1
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       jmp     .L088done
+.align 16
+.L090two:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm6
+       movdqu  (%ebx,%ecx,1),%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm6,%xmm2
+       pxor    %xmm7,%xmm3
+       movl    120(%esp),%edi
+       call    _aesni_decrypt2
+       xorps   %xmm6,%xmm2
+       xorps   %xmm7,%xmm3
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm5
+       movups  %xmm2,(%edi,%esi,1)
+       xorps   %xmm3,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       movaps  %xmm5,%xmm1
+       jmp     .L088done
+.align 16
+.L091three:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm5
+       movdqu  (%ebx,%ecx,1),%xmm6
+       movdqa  %xmm5,%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm5,%xmm2
+       pxor    %xmm6,%xmm3
+       pxor    %xmm7,%xmm4
+       movl    120(%esp),%edi
+       call    _aesni_decrypt3
+       movdqa  96(%esp),%xmm1
+       xorps   %xmm5,%xmm2
+       xorps   %xmm6,%xmm3
+       xorps   %xmm7,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm7,%xmm0
+       movups  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movups  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       jmp     .L088done
+.align 16
+.L092four:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       movl    112(%esp),%edx
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm4
+       movdqu  (%ebx,%ecx,1),%xmm5
+       movdqa  %xmm4,%xmm6
+       movdqu  (%ebx,%eax,1),%xmm7
+       pxor    %xmm0,%xmm4
+       movdqu  (%esi),%xmm2
+       pxor    %xmm4,%xmm5
+       movdqu  16(%esi),%xmm3
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm4,(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm5,16(%esp)
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,96(%esp)
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm7,%xmm5
+       movl    120(%esp),%edi
+       call    _aesni_decrypt4
+       movdqa  96(%esp),%xmm1
+       xorps   (%esp),%xmm2
+       xorps   16(%esp),%xmm3
+       xorps   %xmm6,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       pxor    %xmm2,%xmm1
+       xorps   %xmm7,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqa  %xmm7,%xmm0
+       movups  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movups  %xmm5,48(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+.L088done:
+       movl    128(%esp),%edx
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm2,16(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm2,32(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm2,48(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm2,64(%esp)
+       movdqa  %xmm2,80(%esp)
+       movdqa  %xmm2,96(%esp)
+       leal    (%edx),%esp
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movdqu  %xmm0,(%ecx)
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm1,(%ebx)
+       pxor    %xmm1,%xmm1
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  aesni_ocb_decrypt,.-.L_aesni_ocb_decrypt_begin
+.globl aesni_cbc_encrypt
+.type  aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+.L_aesni_cbc_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       movl    %esp,%ebx
+       movl    24(%esp),%edi
+       subl    $24,%ebx
+       movl    28(%esp),%eax
+       andl    $-16,%ebx
+       movl    32(%esp),%edx
+       movl    36(%esp),%ebp
+       testl   %eax,%eax
+       jz      .L094cbc_abort
+       cmpl    $0,40(%esp)
+       xchgl   %esp,%ebx
+       movups  (%ebp),%xmm7
+       movl    240(%edx),%ecx
+       movl    %edx,%ebp
+       movl    %ebx,16(%esp)
+       movl    %ecx,%ebx
+       je      .L095cbc_decrypt
+       movaps  %xmm7,%xmm2
+       cmpl    $16,%eax
+       jb      .L096cbc_enc_tail
+       subl    $16,%eax
+       jmp     .L097cbc_enc_loop
+.align 16
+.L097cbc_enc_loop:
+       movups  (%esi),%xmm7
+       leal    16(%esi),%esi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm7
+       leal    32(%edx),%edx
+       xorps   %xmm7,%xmm2
+.L098enc1_loop_19:
+.byte  102,15,56,220,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     .L098enc1_loop_19
+.byte  102,15,56,221,209
+       movl    %ebx,%ecx
+       movl    %ebp,%edx
+       movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+       subl    $16,%eax
+       jnc     .L097cbc_enc_loop
+       addl    $16,%eax
+       jnz     .L096cbc_enc_tail
+       movaps  %xmm2,%xmm7
+       pxor    %xmm2,%xmm2
+       jmp     .L099cbc_ret
+.L096cbc_enc_tail:
+       movl    %eax,%ecx
+.long  2767451785
+       movl    $16,%ecx
+       subl    %eax,%ecx
+       xorl    %eax,%eax
+.long  2868115081
+       leal    -16(%edi),%edi
+       movl    %ebx,%ecx
+       movl    %edi,%esi
+       movl    %ebp,%edx
+       jmp     .L097cbc_enc_loop
+.align 16
+.L095cbc_decrypt:
+       cmpl    $80,%eax
+       jbe     .L100cbc_dec_tail
+       movaps  %xmm7,(%esp)
+       subl    $80,%eax
+       jmp     .L101cbc_dec_loop6_enter
+.align 16
+.L102cbc_dec_loop6:
+       movaps  %xmm0,(%esp)
+       movups  %xmm7,(%edi)
+       leal    16(%edi),%edi
+.L101cbc_dec_loop6_enter:
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -1822,28 +2731,28 @@ aesni_cbc_encrypt:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      .L078cbc_dec_loop6
+       ja      .L102cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     .L079cbc_dec_tail_collected
+       jle     .L103cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-.L076cbc_dec_tail:
+.L100cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     .L080cbc_dec_one
+       jbe     .L104cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     .L081cbc_dec_two
+       jbe     .L105cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     .L082cbc_dec_three
+       jbe     .L106cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     .L083cbc_dec_four
+       jbe     .L107cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1861,56 +2770,62 @@ aesni_cbc_encrypt:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L080cbc_dec_one:
+.L104cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L084dec1_loop_16:
+.L109dec1_loop_20:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L084dec1_loop_16
+       jnz     .L109dec1_loop_20
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L081cbc_dec_two:
-       xorps   %xmm4,%xmm4
-       call    _aesni_decrypt3
+.L105cbc_dec_two:
+       call    _aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L082cbc_dec_three:
+.L106cbc_dec_three:
        call    _aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     .L079cbc_dec_tail_collected
+       jmp     .L108cbc_dec_tail_collected
 .align 16
-.L083cbc_dec_four:
+.L107cbc_dec_four:
        call    _aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1920,28 +2835,44 @@ aesni_cbc_encrypt:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-.L079cbc_dec_tail_collected:
+       jmp     .L108cbc_dec_tail_collected
+.align 16
+.L103cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+.L108cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     .L085cbc_dec_tail_partial
+       jnz     .L110cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     .L075cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     .L099cbc_ret
 .align 16
-.L085cbc_dec_tail_partial:
+.L110cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-.L075cbc_ret:
+       movdqa  %xmm2,(%esp)
+.L099cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-.L070cbc_abort:
+       pxor    %xmm7,%xmm7
+.L094cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1951,52 +2882,62 @@ aesni_cbc_encrypt:
 .type  _aesni_set_encrypt_key,@function
 .align 16
 _aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      .L086bad_pointer
+       jz      .L111bad_pointer
        testl   %edx,%edx
-       jz      .L086bad_pointer
+       jz      .L111bad_pointer
+       call    .L112pic
+.L112pic:
+       popl    %ebx
+       leal    .Lkey_const-.L112pic(%ebx),%ebx
+       leal    _gnutls_x86_cpuid_s,%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      .L08714rounds
+       je      .L11314rounds
        cmpl    $192,%ecx
-       je      .L08812rounds
+       je      .L11412rounds
        cmpl    $128,%ecx
-       jne     .L089bad_keybits
+       jne     .L115bad_keybits
 .align 16
-.L09010rounds:
+.L11610rounds:
+       cmpl    $268435456,%ebp
+       je      .L11710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    .L091key_128_cold
+       call    .L118key_128_cold
 .byte  102,15,58,223,200,2
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,4
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,8
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,16
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,32
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,64
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,128
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,27
-       call    .L092key_128
+       call    .L119key_128
 .byte  102,15,58,223,200,54
-       call    .L092key_128
+       call    .L119key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L120good_key
 .align 16
-.L092key_128:
+.L119key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-.L091key_128_cold:
+.L118key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2005,38 +2946,91 @@ _aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L08812rounds:
+.L11710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+.L121loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     .L121loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     .L120good_key
+.align 16
+.L11412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      .L12212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L093key_192a_cold
+       call    .L123key_192a_cold
 .byte  102,15,58,223,202,2
-       call    .L094key_192b
+       call    .L124key_192b
 .byte  102,15,58,223,202,4
-       call    .L095key_192a
+       call    .L125key_192a
 .byte  102,15,58,223,202,8
-       call    .L094key_192b
+       call    .L124key_192b
 .byte  102,15,58,223,202,16
-       call    .L095key_192a
+       call    .L125key_192a
 .byte  102,15,58,223,202,32
-       call    .L094key_192b
+       call    .L124key_192b
 .byte  102,15,58,223,202,64
-       call    .L095key_192a
+       call    .L125key_192a
 .byte  102,15,58,223,202,128
-       call    .L094key_192b
+       call    .L124key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L120good_key
 .align 16
-.L095key_192a:
+.L125key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 16
-.L093key_192a_cold:
+.L123key_192a_cold:
        movaps  %xmm2,%xmm5
-.L096key_192b_warm:
+.L126key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2050,56 +3044,90 @@ _aesni_set_encrypt_key:
        pxor    %xmm3,%xmm2
        ret
 .align 16
-.L094key_192b:
+.L124key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     .L096key_192b_warm
+       jmp     .L126key_192b_warm
+.align 16
+.L12212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+.L127loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     .L127loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     .L120good_key
 .align 16
-.L08714rounds:
+.L11314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      .L12814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L097key_256a_cold
+       call    .L129key_256a_cold
 .byte  102,15,58,223,200,1
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,2
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,2
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,4
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,4
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,8
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,8
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,16
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,16
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,32
-       call    .L099key_256a
+       call    .L131key_256a
 .byte  102,15,58,223,200,32
-       call    .L098key_256b
+       call    .L130key_256b
 .byte  102,15,58,223,202,64
-       call    .L099key_256a
+       call    .L131key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     .L120good_key
 .align 16
-.L099key_256a:
+.L131key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-.L097key_256a_cold:
+.L129key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2108,7 +3136,7 @@ _aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L098key_256b:
+.L130key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2118,13 +3146,70 @@ _aesni_set_encrypt_key:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 16
+.L12814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+.L132loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      .L133done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     .L132loop_key256
+.L133done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+.L120good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 4
-.L086bad_pointer:
+.L111bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 4
-.L089bad_keybits:
+.L115bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .size  _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
 .globl aesni_set_encrypt_key
@@ -2150,7 +3235,7 @@ aesni_set_decrypt_key:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     .L100dec_key_ret
+       jnz     .L134dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2158,7 +3243,7 @@ aesni_set_decrypt_key:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-.L101dec_key_inverse:
+.L135dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2168,20 +3253,26 @@ aesni_set_decrypt_key:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      .L101dec_key_inverse
+       ja      .L135dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-.L100dec_key_ret:
+.L134dec_key_ret:
        ret
 .size  aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
-
+.comm  _gnutls_x86_cpuid_s,16,4
 
 .section .note.GNU-stack,"",%progbits
-
-
index 76d44fc2a8b5db43e70d62c525adee06e34229e6..43cf4e68defaae6b23414d70537a2ff3aa7314a4 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,7 @@
 .type  aesni_encrypt,@function
 .align 16
 aesni_encrypt:
+.cfi_startproc 
        movups  (%rdi),%xmm2
        movl    240(%rdx),%eax
        movups  (%rdx),%xmm0
@@ -61,12 +62,14 @@ aesni_encrypt:
        movups  %xmm2,(%rsi)
        pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_encrypt,.-aesni_encrypt
 
 .globl aesni_decrypt
 .type  aesni_decrypt,@function
 .align 16
 aesni_decrypt:
+.cfi_startproc 
        movups  (%rdi),%xmm2
        movl    240(%rdx),%eax
        movups  (%rdx),%xmm0
@@ -85,10 +88,12 @@ aesni_decrypt:
        movups  %xmm2,(%rsi)
        pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_decrypt, .-aesni_decrypt
 .type  _aesni_encrypt2,@function
 .align 16
 _aesni_encrypt2:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -114,10 +119,12 @@ _aesni_encrypt2:
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_encrypt2,.-_aesni_encrypt2
 .type  _aesni_decrypt2,@function
 .align 16
 _aesni_decrypt2:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -143,10 +150,12 @@ _aesni_decrypt2:
 .byte  102,15,56,223,208
 .byte  102,15,56,223,216
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_decrypt2,.-_aesni_decrypt2
 .type  _aesni_encrypt3,@function
 .align 16
 _aesni_encrypt3:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -177,10 +186,12 @@ _aesni_encrypt3:
 .byte  102,15,56,221,216
 .byte  102,15,56,221,224
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_encrypt3,.-_aesni_encrypt3
 .type  _aesni_decrypt3,@function
 .align 16
 _aesni_decrypt3:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -211,10 +222,12 @@ _aesni_decrypt3:
 .byte  102,15,56,223,216
 .byte  102,15,56,223,224
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_decrypt3,.-_aesni_decrypt3
 .type  _aesni_encrypt4,@function
 .align 16
 _aesni_encrypt4:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -251,10 +264,12 @@ _aesni_encrypt4:
 .byte  102,15,56,221,224
 .byte  102,15,56,221,232
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_encrypt4,.-_aesni_encrypt4
 .type  _aesni_decrypt4,@function
 .align 16
 _aesni_decrypt4:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -291,10 +306,12 @@ _aesni_decrypt4:
 .byte  102,15,56,223,224
 .byte  102,15,56,223,232
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_decrypt4,.-_aesni_decrypt4
 .type  _aesni_encrypt6,@function
 .align 16
 _aesni_encrypt6:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -345,10 +362,12 @@ _aesni_encrypt6:
 .byte  102,15,56,221,240
 .byte  102,15,56,221,248
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_encrypt6,.-_aesni_encrypt6
 .type  _aesni_decrypt6,@function
 .align 16
 _aesni_decrypt6:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -399,10 +418,12 @@ _aesni_decrypt6:
 .byte  102,15,56,223,240
 .byte  102,15,56,223,248
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_decrypt6,.-_aesni_decrypt6
 .type  _aesni_encrypt8,@function
 .align 16
 _aesni_encrypt8:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -463,10 +484,12 @@ _aesni_encrypt8:
 .byte  102,68,15,56,221,192
 .byte  102,68,15,56,221,200
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_encrypt8,.-_aesni_encrypt8
 .type  _aesni_decrypt8,@function
 .align 16
 _aesni_decrypt8:
+.cfi_startproc 
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -527,11 +550,13 @@ _aesni_decrypt8:
 .byte  102,68,15,56,223,192
 .byte  102,68,15,56,223,200
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  _aesni_decrypt8,.-_aesni_decrypt8
 .globl aesni_ecb_encrypt
 .type  aesni_ecb_encrypt,@function
 .align 16
 aesni_ecb_encrypt:
+.cfi_startproc 
        andq    $-16,%rdx
        jz      .Lecb_ret
 
@@ -869,6 +894,7 @@ aesni_ecb_encrypt:
        xorps   %xmm0,%xmm0
        pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_ecb_encrypt,.-aesni_ecb_encrypt
 .globl aesni_ccm64_encrypt_blocks
 .type  aesni_ccm64_encrypt_blocks,@function
@@ -1034,6 +1060,7 @@ aesni_ccm64_decrypt_blocks:
 .type  aesni_ctr32_encrypt_blocks,@function
 .align 16
 aesni_ctr32_encrypt_blocks:
+.cfi_startproc 
        cmpq    $1,%rdx
        jne     .Lctr32_bulk
 
@@ -1063,11 +1090,12 @@ aesni_ctr32_encrypt_blocks:
 
 .align 16
 .Lctr32_bulk:
-       leaq    (%rsp),%rax
+       leaq    (%rsp),%r11
+.cfi_def_cfa_register  %r11
        pushq   %rbp
+.cfi_offset    %rbp,-16
        subq    $128,%rsp
        andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
 
 
 
@@ -1076,7 +1104,7 @@ aesni_ctr32_encrypt_blocks:
        movdqu  (%rcx),%xmm0
        movl    12(%r8),%r8d
        pxor    %xmm0,%xmm2
-       movl    12(%rcx),%r11d
+       movl    12(%rcx),%ebp
        movdqa  %xmm2,0(%rsp)
        bswapl  %r8d
        movdqa  %xmm2,%xmm3
@@ -1092,8 +1120,8 @@ aesni_ctr32_encrypt_blocks:
        leaq    2(%r8),%rdx
        bswapl  %eax
        bswapl  %edx
-       xorl    %r11d,%eax
-       xorl    %r11d,%edx
+       xorl    %ebp,%eax
+       xorl    %ebp,%edx
 .byte  102,15,58,34,216,3
        leaq    3(%r8),%rax
        movdqa  %xmm3,16(%rsp)
@@ -1102,25 +1130,25 @@ aesni_ctr32_encrypt_blocks:
        movq    %r10,%rdx
        leaq    4(%r8),%r10
        movdqa  %xmm4,32(%rsp)
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
        bswapl  %r10d
 .byte  102,15,58,34,232,3
-       xorl    %r11d,%r10d
+       xorl    %ebp,%r10d
        movdqa  %xmm5,48(%rsp)
        leaq    5(%r8),%r9
        movl    %r10d,64+12(%rsp)
        bswapl  %r9d
        leaq    6(%r8),%r10
        movl    240(%rcx),%eax
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        bswapl  %r10d
        movl    %r9d,80+12(%rsp)
-       xorl    %r11d,%r10d
+       xorl    %ebp,%r10d
        leaq    7(%r8),%r9
        movl    %r10d,96+12(%rsp)
        bswapl  %r9d
        movl    _gnutls_x86_cpuid_s+4(%rip),%r10d
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        andl    $71303168,%r10d
        movl    %r9d,112+12(%rsp)
 
@@ -1144,7 +1172,7 @@ aesni_ctr32_encrypt_blocks:
 .Lctr32_6x:
        shll    $4,%eax
        movl    $48,%r10d
-       bswapl  %r11d
+       bswapl  %ebp
        leaq    32(%rcx,%rax,1),%rcx
        subq    %rax,%r10
        jmp     .Lctr32_loop6
@@ -1155,32 +1183,32 @@ aesni_ctr32_encrypt_blocks:
        movups  -48(%rcx,%r10,1),%xmm0
 .byte  102,15,56,220,209
        movl    %r8d,%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,217
 .byte  0x0f,0x38,0xf1,0x44,0x24,12
        leal    1(%r8),%eax
 .byte  102,15,56,220,225
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  0x0f,0x38,0xf1,0x44,0x24,28
 .byte  102,15,56,220,233
        leal    2(%r8),%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,241
 .byte  0x0f,0x38,0xf1,0x44,0x24,44
        leal    3(%r8),%eax
 .byte  102,15,56,220,249
        movups  -32(%rcx,%r10,1),%xmm1
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 
 .byte  102,15,56,220,208
 .byte  0x0f,0x38,0xf1,0x44,0x24,60
        leal    4(%r8),%eax
 .byte  102,15,56,220,216
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  0x0f,0x38,0xf1,0x44,0x24,76
 .byte  102,15,56,220,224
        leal    5(%r8),%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,232
 .byte  0x0f,0x38,0xf1,0x44,0x24,92
        movq    %r10,%rax
@@ -1241,7 +1269,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
        movups  32-128(%rcx),%xmm0
 .byte  102,15,56,220,225
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        nop
 .byte  102,15,56,220,233
        movl    %r9d,0+12(%rsp)
@@ -1254,7 +1282,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1268,7 +1296,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1282,7 +1310,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1296,7 +1324,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1310,7 +1338,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1324,7 +1352,7 @@ aesni_ctr32_encrypt_blocks:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1339,7 +1367,7 @@ aesni_ctr32_encrypt_blocks:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
 .byte  102,15,56,220,224
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        movdqu  0(%rdi),%xmm10
 .byte  102,15,56,220,232
        movl    %r9d,112+12(%rsp)
@@ -1574,7 +1602,7 @@ aesni_ctr32_encrypt_blocks:
 
 .Lctr32_done:
        xorps   %xmm0,%xmm0
-       xorl    %r11d,%r11d
+       xorl    %ebp,%ebp
        pxor    %xmm1,%xmm1
        pxor    %xmm2,%xmm2
        pxor    %xmm3,%xmm3
@@ -1598,20 +1626,25 @@ aesni_ctr32_encrypt_blocks:
        pxor    %xmm14,%xmm14
        movaps  %xmm0,112(%rsp)
        pxor    %xmm15,%xmm15
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+.cfi_restore   %rbp
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
 .Lctr32_epilogue:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
 .globl aesni_xts_encrypt
 .type  aesni_xts_encrypt,@function
 .align 16
 aesni_xts_encrypt:
-       leaq    (%rsp),%rax
+.cfi_startproc 
+       leaq    (%rsp),%r11
+.cfi_def_cfa_register  %r11
        pushq   %rbp
+.cfi_offset    %rbp,-16
        subq    $112,%rsp
        andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
        movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
@@ -1627,7 +1660,7 @@ aesni_xts_encrypt:
        jnz     .Loop_enc1_8
 .byte  102,15,56,221,209
        movups  (%rcx),%xmm0
-       movq    %rcx,%r11
+       movq    %rcx,%rbp
        movl    %r10d,%eax
        shll    $4,%r10d
        movq    %rdx,%r9
@@ -1683,9 +1716,9 @@ aesni_xts_encrypt:
        jc      .Lxts_enc_short
 
        movl    $16+96,%eax
-       leaq    32(%r11,%r10,1),%rcx
+       leaq    32(%rbp,%r10,1),%rcx
        subq    %r10,%rax
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
        movq    %rax,%r10
        leaq    .Lxts_magic(%rip),%r8
        jmp     .Lxts_enc_grandloop
@@ -1710,7 +1743,7 @@ aesni_xts_encrypt:
        movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
 .byte  102,15,56,220,233
-       movups  32(%r11),%xmm0
+       movups  32(%rbp),%xmm0
        leaq    96(%rdi),%rdi
        pxor    %xmm8,%xmm7
 
@@ -1719,7 +1752,7 @@ aesni_xts_encrypt:
        pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,220,249
-       movups  48(%r11),%xmm1
+       movups  48(%rbp),%xmm1
        pxor    %xmm9,%xmm12
 
 .byte  102,15,56,220,208
@@ -1734,7 +1767,7 @@ aesni_xts_encrypt:
        movdqa  %xmm14,64(%rsp)
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  64(%r11),%xmm0
+       movups  64(%rbp),%xmm0
        movdqa  %xmm8,80(%rsp)
        pshufd  $0x5f,%xmm15,%xmm9
        jmp     .Lxts_enc_loop6
@@ -1766,7 +1799,7 @@ aesni_xts_encrypt:
        psrad   $31,%xmm14
 .byte  102,15,56,220,217
        pand    %xmm8,%xmm14
-       movups  (%r11),%xmm10
+       movups  (%rbp),%xmm10
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -1834,10 +1867,10 @@ aesni_xts_encrypt:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
        pxor    %xmm0,%xmm15
-       movups  (%r11),%xmm0
+       movups  (%rbp),%xmm0
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
 
        pxor    %xmm15,%xmm14
 .byte  102,15,56,221,84,36,0
@@ -1864,7 +1897,7 @@ aesni_xts_encrypt:
 
        movl    $16+96,%eax
        subl    %r10d,%eax
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        shrl    $4,%eax
 
 .Lxts_enc_short:
@@ -2020,7 +2053,7 @@ aesni_xts_encrypt:
        jnz     .Lxts_enc_steal
 
        subq    %r9,%rsi
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  -16(%rsi),%xmm2
@@ -2063,20 +2096,25 @@ aesni_xts_encrypt:
        movaps  %xmm0,96(%rsp)
        pxor    %xmm14,%xmm14
        pxor    %xmm15,%xmm15
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+.cfi_restore   %rbp
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
 .Lxts_enc_epilogue:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_xts_encrypt,.-aesni_xts_encrypt
 .globl aesni_xts_decrypt
 .type  aesni_xts_decrypt,@function
 .align 16
 aesni_xts_decrypt:
-       leaq    (%rsp),%rax
+.cfi_startproc 
+       leaq    (%rsp),%r11
+.cfi_def_cfa_register  %r11
        pushq   %rbp
+.cfi_offset    %rbp,-16
        subq    $112,%rsp
        andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
        movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
@@ -2098,7 +2136,7 @@ aesni_xts_decrypt:
        subq    %rax,%rdx
 
        movups  (%rcx),%xmm0
-       movq    %rcx,%r11
+       movq    %rcx,%rbp
        movl    %r10d,%eax
        shll    $4,%r10d
        movq    %rdx,%r9
@@ -2154,9 +2192,9 @@ aesni_xts_decrypt:
        jc      .Lxts_dec_short
 
        movl    $16+96,%eax
-       leaq    32(%r11,%r10,1),%rcx
+       leaq    32(%rbp,%r10,1),%rcx
        subq    %r10,%rax
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
        movq    %rax,%r10
        leaq    .Lxts_magic(%rip),%r8
        jmp     .Lxts_dec_grandloop
@@ -2181,7 +2219,7 @@ aesni_xts_decrypt:
        movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
 .byte  102,15,56,222,233
-       movups  32(%r11),%xmm0
+       movups  32(%rbp),%xmm0
        leaq    96(%rdi),%rdi
        pxor    %xmm8,%xmm7
 
@@ -2190,7 +2228,7 @@ aesni_xts_decrypt:
        pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,222,249
-       movups  48(%r11),%xmm1
+       movups  48(%rbp),%xmm1
        pxor    %xmm9,%xmm12
 
 .byte  102,15,56,222,208
@@ -2205,7 +2243,7 @@ aesni_xts_decrypt:
        movdqa  %xmm14,64(%rsp)
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  64(%r11),%xmm0
+       movups  64(%rbp),%xmm0
        movdqa  %xmm8,80(%rsp)
        pshufd  $0x5f,%xmm15,%xmm9
        jmp     .Lxts_dec_loop6
@@ -2237,7 +2275,7 @@ aesni_xts_decrypt:
        psrad   $31,%xmm14
 .byte  102,15,56,222,217
        pand    %xmm8,%xmm14
-       movups  (%r11),%xmm10
+       movups  (%rbp),%xmm10
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -2305,10 +2343,10 @@ aesni_xts_decrypt:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
        pxor    %xmm0,%xmm15
-       movups  (%r11),%xmm0
+       movups  (%rbp),%xmm0
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
 
        pxor    %xmm15,%xmm14
 .byte  102,15,56,223,84,36,0
@@ -2335,7 +2373,7 @@ aesni_xts_decrypt:
 
        movl    $16+96,%eax
        subl    %r10d,%eax
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        shrl    $4,%eax
 
 .Lxts_dec_short:
@@ -2492,7 +2530,7 @@ aesni_xts_decrypt:
        jz      .Lxts_dec_ret
 .Lxts_dec_done2:
        movq    %r9,%rdx
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  (%rdi),%xmm2
@@ -2522,7 +2560,7 @@ aesni_xts_decrypt:
        jnz     .Lxts_dec_steal
 
        subq    %r9,%rsi
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  (%rsi),%xmm2
@@ -2565,221 +2603,1079 @@ aesni_xts_decrypt:
        movaps  %xmm0,96(%rsp)
        pxor    %xmm14,%xmm14
        pxor    %xmm15,%xmm15
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+.cfi_restore   %rbp
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
 .Lxts_dec_epilogue:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_xts_decrypt,.-aesni_xts_decrypt
-.globl aesni_cbc_encrypt
-.type  aesni_cbc_encrypt,@function
-.align 16
-aesni_cbc_encrypt:
-       testq   %rdx,%rdx
-       jz      .Lcbc_ret
+.globl aesni_ocb_encrypt
+.type  aesni_ocb_encrypt,@function
+.align 32
+aesni_ocb_encrypt:
+.cfi_startproc 
+       leaq    (%rsp),%rax
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       movq    8(%rax),%rbx
+       movq    8+8(%rax),%rbp
 
        movl    240(%rcx),%r10d
        movq    %rcx,%r11
-       testl   %r9d,%r9d
-       jz      .Lcbc_decrypt
+       shll    $4,%r10d
+       movups  (%rcx),%xmm9
+       movups  16(%rcx,%r10,1),%xmm1
 
-       movups  (%r8),%xmm2
-       movl    %r10d,%eax
-       cmpq    $16,%rdx
-       jb      .Lcbc_enc_tail
-       subq    $16,%rdx
-       jmp     .Lcbc_enc_loop
-.align 16
-.Lcbc_enc_loop:
-       movups  (%rdi),%xmm3
-       leaq    16(%rdi),%rdi
+       movdqu  (%r9),%xmm15
+       pxor    %xmm1,%xmm9
+       pxor    %xmm1,%xmm15
 
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       xorps   %xmm0,%xmm3
-       leaq    32(%rcx),%rcx
-       xorps   %xmm3,%xmm2
-.Loop_enc1_15:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_15
-.byte  102,15,56,221,209
-       movl    %r10d,%eax
-       movq    %r11,%rcx
-       movups  %xmm2,0(%rsi)
-       leaq    16(%rsi),%rsi
-       subq    $16,%rdx
-       jnc     .Lcbc_enc_loop
-       addq    $16,%rdx
-       jnz     .Lcbc_enc_tail
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       movups  %xmm2,(%r8)
-       pxor    %xmm2,%xmm2
-       pxor    %xmm3,%xmm3
-       jmp     .Lcbc_ret
+       movl    $16+32,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       movups  16(%r11),%xmm1
+       subq    %r10,%rax
+       movq    %rax,%r10
 
-.Lcbc_enc_tail:
-       movq    %rdx,%rcx
-       xchgq   %rdi,%rsi
-.long  0x9066A4F3
-       movl    $16,%ecx
-       subq    %rdx,%rcx
-       xorl    %eax,%eax
-.long  0x9066AAF3
-       leaq    -16(%rdi),%rdi
-       movl    %r10d,%eax
-       movq    %rdi,%rsi
-       movq    %r11,%rcx
-       xorq    %rdx,%rdx
-       jmp     .Lcbc_enc_loop
+       movdqu  (%rbx),%xmm10
+       movdqu  (%rbp),%xmm8
 
-.align 16
-.Lcbc_decrypt:
-       cmpq    $16,%rdx
-       jne     .Lcbc_decrypt_bulk
+       testq   $1,%r8
+       jnz     .Locb_enc_odd
 
+       bsfq    %r8,%r12
+       addq    $1,%r8
+       shlq    $4,%r12
+       movdqu  (%rbx,%r12,1),%xmm7
+       movdqu  (%rdi),%xmm2
+       leaq    16(%rdi),%rdi
 
+       call    __ocb_encrypt1
 
-       movdqu  (%rdi),%xmm2
-       movdqu  (%r8),%xmm3
-       movdqa  %xmm2,%xmm4
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-.Loop_dec1_16:
-.byte  102,15,56,222,209
-       decl    %r10d
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_dec1_16
-.byte  102,15,56,223,209
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       movdqu  %xmm4,(%r8)
-       xorps   %xmm3,%xmm2
-       pxor    %xmm3,%xmm3
+       movdqa  %xmm7,%xmm15
        movups  %xmm2,(%rsi)
-       pxor    %xmm2,%xmm2
-       jmp     .Lcbc_ret
-.align 16
-.Lcbc_decrypt_bulk:
-       leaq    (%rsp),%rax
-       pushq   %rbp
-       subq    $16,%rsp
-       andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
-       movups  (%r8),%xmm10
-       movl    %r10d,%eax
-       cmpq    $0x50,%rdx
-       jbe     .Lcbc_dec_tail
+       leaq    16(%rsi),%rsi
+       subq    $1,%rdx
+       jz      .Locb_enc_done
+
+.Locb_enc_odd:
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       leaq    6(%r8),%r8
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+       shlq    $4,%r12
+       shlq    $4,%r13
+       shlq    $4,%r14
 
-       movups  (%rcx),%xmm0
+       subq    $6,%rdx
+       jc      .Locb_enc_short
+       jmp     .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
        movdqu  0(%rdi),%xmm2
        movdqu  16(%rdi),%xmm3
-       movdqa  %xmm2,%xmm11
        movdqu  32(%rdi),%xmm4
-       movdqa  %xmm3,%xmm12
        movdqu  48(%rdi),%xmm5
-       movdqa  %xmm4,%xmm13
        movdqu  64(%rdi),%xmm6
-       movdqa  %xmm5,%xmm14
        movdqu  80(%rdi),%xmm7
-       movdqa  %xmm6,%xmm15
-       movl    _gnutls_x86_cpuid_s+4(%rip),%r9d
-       cmpq    $0x70,%rdx
-       jbe     .Lcbc_dec_six_or_seven
+       leaq    96(%rdi),%rdi
+
+       call    __ocb_encrypt6
+
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+       movups  %xmm6,64(%rsi)
+       movups  %xmm7,80(%rsi)
+       leaq    96(%rsi),%rsi
+       subq    $6,%rdx
+       jnc     .Locb_enc_grandloop
+
+.Locb_enc_short:
+       addq    $6,%rdx
+       jz      .Locb_enc_done
+
+       movdqu  0(%rdi),%xmm2
+       cmpq    $2,%rdx
+       jb      .Locb_enc_one
+       movdqu  16(%rdi),%xmm3
+       je      .Locb_enc_two
+
+       movdqu  32(%rdi),%xmm4
+       cmpq    $4,%rdx
+       jb      .Locb_enc_three
+       movdqu  48(%rdi),%xmm5
+       je      .Locb_enc_four
+
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm7,%xmm7
+
+       call    __ocb_encrypt6
+
+       movdqa  %xmm14,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+       movups  %xmm6,64(%rsi)
+
+       jmp     .Locb_enc_done
 
-       andl    $71303168,%r9d
-       subq    $0x50,%rdx
-       cmpl    $4194304,%r9d
-       je      .Lcbc_dec_loop6_enter
-       subq    $0x20,%rdx
-       leaq    112(%rcx),%rcx
-       jmp     .Lcbc_dec_loop8_enter
 .align 16
-.Lcbc_dec_loop8:
-       movups  %xmm9,(%rsi)
-       leaq    16(%rsi),%rsi
-.Lcbc_dec_loop8_enter:
-       movdqu  96(%rdi),%xmm8
-       pxor    %xmm0,%xmm2
-       movdqu  112(%rdi),%xmm9
-       pxor    %xmm0,%xmm3
-       movups  16-112(%rcx),%xmm1
-       pxor    %xmm0,%xmm4
-       xorq    %r11,%r11
-       cmpq    $0x70,%rdx
-       pxor    %xmm0,%xmm5
-       pxor    %xmm0,%xmm6
-       pxor    %xmm0,%xmm7
-       pxor    %xmm0,%xmm8
+.Locb_enc_one:
+       movdqa  %xmm10,%xmm7
 
-.byte  102,15,56,222,209
-       pxor    %xmm0,%xmm9
-       movups  32-112(%rcx),%xmm0
-.byte  102,15,56,222,217
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-       setnc   %r11b
-       shlq    $7,%r11
-.byte  102,68,15,56,222,201
-       addq    %rdi,%r11
-       movups  48-112(%rcx),%xmm1
-.byte  102,15,56,222,208
-.byte  102,15,56,222,216
-.byte  102,15,56,222,224
-.byte  102,15,56,222,232
-.byte  102,15,56,222,240
-.byte  102,15,56,222,248
-.byte  102,68,15,56,222,192
-.byte  102,68,15,56,222,200
-       movups  64-112(%rcx),%xmm0
-       nop
-.byte  102,15,56,222,209
-.byte  102,15,56,222,217
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  80-112(%rcx),%xmm1
-       nop
-.byte  102,15,56,222,208
-.byte  102,15,56,222,216
-.byte  102,15,56,222,224
-.byte  102,15,56,222,232
-.byte  102,15,56,222,240
-.byte  102,15,56,222,248
-.byte  102,68,15,56,222,192
-.byte  102,68,15,56,222,200
-       movups  96-112(%rcx),%xmm0
-       nop
-.byte  102,15,56,222,209
-.byte  102,15,56,222,217
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  112-112(%rcx),%xmm1
-       nop
-.byte  102,15,56,222,208
-.byte  102,15,56,222,216
-.byte  102,15,56,222,224
-.byte  102,15,56,222,232
-.byte  102,15,56,222,240
-.byte  102,15,56,222,248
-.byte  102,68,15,56,222,192
-.byte  102,68,15,56,222,200
-       movups  128-112(%rcx),%xmm0
+       call    __ocb_encrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,0(%rsi)
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_encrypt4
+
+       movdqa  %xmm11,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_encrypt4
+
+       movdqa  %xmm12,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+       call    __ocb_encrypt4
+
+       movdqa  %xmm13,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+
+.Locb_enc_done:
+       pxor    %xmm0,%xmm15
+       movdqu  %xmm8,(%rbp)
+       movdqu  %xmm15,(%r9)
+
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       leaq    40(%rsp),%rax
+.cfi_def_cfa   %rax,8
+       movq    -40(%rax),%r14
+.cfi_restore   %r14
+       movq    -32(%rax),%r13
+.cfi_restore   %r13
+       movq    -24(%rax),%r12
+.cfi_restore   %r12
+       movq    -16(%rax),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rax),%rbx
+.cfi_restore   %rbx
+       leaq    (%rax),%rsp
+.cfi_def_cfa_register  %rsp
+.Locb_enc_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type  __ocb_encrypt6,@function
+.align 32
+__ocb_encrypt6:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       movdqa  %xmm10,%xmm14
+       pxor    %xmm15,%xmm10
+       movdqu  (%rbx,%r14,1),%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm2,%xmm8
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm3,%xmm8
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm4,%xmm8
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm14
+       pxor    %xmm5,%xmm8
+       pxor    %xmm13,%xmm5
+       pxor    %xmm14,%xmm15
+       pxor    %xmm6,%xmm8
+       pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm8
+       pxor    %xmm15,%xmm7
+       movups  32(%r11),%xmm0
+
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       addq    $6,%r8
+       pxor    %xmm9,%xmm10
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+.byte  102,15,56,220,241
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm14
+.byte  102,15,56,220,249
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm15
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  64(%r11),%xmm0
+       shlq    $4,%r12
+       shlq    $4,%r13
+       jmp     .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_enc_loop6
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  16(%r11),%xmm1
+       shlq    $4,%r14
+
+.byte  102,65,15,56,221,210
+       movdqu  (%rbx),%xmm10
+       movq    %r10,%rax
+.byte  102,65,15,56,221,219
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+.byte  102,65,15,56,221,246
+.byte  102,65,15,56,221,255
+       .byte   0xf3,0xc3
+.size  __ocb_encrypt6,.-__ocb_encrypt6
+
+.type  __ocb_encrypt4,@function
+.align 32
+__ocb_encrypt4:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       pxor    %xmm15,%xmm10
+       pxor    %xmm10,%xmm11
+       pxor    %xmm2,%xmm8
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm3,%xmm8
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm4,%xmm8
+       pxor    %xmm12,%xmm4
+       pxor    %xmm5,%xmm8
+       pxor    %xmm13,%xmm5
+       movups  32(%r11),%xmm0
+
+       pxor    %xmm9,%xmm10
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+       pxor    %xmm9,%xmm13
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  48(%r11),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  64(%r11),%xmm0
+       jmp     .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_enc_loop4
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,65,15,56,221,210
+.byte  102,65,15,56,221,219
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+       .byte   0xf3,0xc3
+.size  __ocb_encrypt4,.-__ocb_encrypt4
+
+.type  __ocb_encrypt1,@function
+.align 32
+__ocb_encrypt1:
+       pxor    %xmm15,%xmm7
+       pxor    %xmm9,%xmm7
+       pxor    %xmm2,%xmm8
+       pxor    %xmm7,%xmm2
+       movups  32(%r11),%xmm0
+
+.byte  102,15,56,220,209
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm7
+
+.byte  102,15,56,220,208
+       movups  64(%r11),%xmm0
+       jmp     .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+.byte  102,15,56,220,209
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_enc_loop1
+
+.byte  102,15,56,220,209
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,15,56,221,215
+       .byte   0xf3,0xc3
+.size  __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type  aesni_ocb_decrypt,@function
+.align 32
+aesni_ocb_decrypt:
+.cfi_startproc 
+       leaq    (%rsp),%rax
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       movq    8(%rax),%rbx
+       movq    8+8(%rax),%rbp
+
+       movl    240(%rcx),%r10d
+       movq    %rcx,%r11
+       shll    $4,%r10d
+       movups  (%rcx),%xmm9
+       movups  16(%rcx,%r10,1),%xmm1
+
+       movdqu  (%r9),%xmm15
+       pxor    %xmm1,%xmm9
+       pxor    %xmm1,%xmm15
+
+       movl    $16+32,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       movups  16(%r11),%xmm1
+       subq    %r10,%rax
+       movq    %rax,%r10
+
+       movdqu  (%rbx),%xmm10
+       movdqu  (%rbp),%xmm8
+
+       testq   $1,%r8
+       jnz     .Locb_dec_odd
+
+       bsfq    %r8,%r12
+       addq    $1,%r8
+       shlq    $4,%r12
+       movdqu  (%rbx,%r12,1),%xmm7
+       movdqu  (%rdi),%xmm2
+       leaq    16(%rdi),%rdi
+
+       call    __ocb_decrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm8
+       leaq    16(%rsi),%rsi
+       subq    $1,%rdx
+       jz      .Locb_dec_done
+
+.Locb_dec_odd:
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       leaq    6(%r8),%r8
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+       shlq    $4,%r12
+       shlq    $4,%r13
+       shlq    $4,%r14
+
+       subq    $6,%rdx
+       jc      .Locb_dec_short
+       jmp     .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+       movdqu  0(%rdi),%xmm2
+       movdqu  16(%rdi),%xmm3
+       movdqu  32(%rdi),%xmm4
+       movdqu  48(%rdi),%xmm5
+       movdqu  64(%rdi),%xmm6
+       movdqu  80(%rdi),%xmm7
+       leaq    96(%rdi),%rdi
+
+       call    __ocb_decrypt6
+
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+       movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm8
+       movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm8
+       leaq    96(%rsi),%rsi
+       subq    $6,%rdx
+       jnc     .Locb_dec_grandloop
+
+.Locb_dec_short:
+       addq    $6,%rdx
+       jz      .Locb_dec_done
+
+       movdqu  0(%rdi),%xmm2
+       cmpq    $2,%rdx
+       jb      .Locb_dec_one
+       movdqu  16(%rdi),%xmm3
+       je      .Locb_dec_two
+
+       movdqu  32(%rdi),%xmm4
+       cmpq    $4,%rdx
+       jb      .Locb_dec_three
+       movdqu  48(%rdi),%xmm5
+       je      .Locb_dec_four
+
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm7,%xmm7
+
+       call    __ocb_decrypt6
+
+       movdqa  %xmm14,%xmm15
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+       movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm8
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+       movdqa  %xmm10,%xmm7
+
+       call    __ocb_decrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_decrypt4
+
+       movdqa  %xmm11,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       xorps   %xmm3,%xmm8
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_decrypt4
+
+       movdqa  %xmm12,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       xorps   %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       xorps   %xmm4,%xmm8
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+       call    __ocb_decrypt4
+
+       movdqa  %xmm13,%xmm15
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+
+.Locb_dec_done:
+       pxor    %xmm0,%xmm15
+       movdqu  %xmm8,(%rbp)
+       movdqu  %xmm15,(%r9)
+
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       leaq    40(%rsp),%rax
+.cfi_def_cfa   %rax,8
+       movq    -40(%rax),%r14
+.cfi_restore   %r14
+       movq    -32(%rax),%r13
+.cfi_restore   %r13
+       movq    -24(%rax),%r12
+.cfi_restore   %r12
+       movq    -16(%rax),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rax),%rbx
+.cfi_restore   %rbx
+       leaq    (%rax),%rsp
+.cfi_def_cfa_register  %rsp
+.Locb_dec_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type  __ocb_decrypt6,@function                # helper: runs six 16-byte blocks (%xmm2..%xmm7) through the AES decrypt rounds in parallel
+.align 32
+__ocb_decrypt6:                                # reads caller-prepared %xmm1,%xmm9,%xmm10,%xmm15,%rbx,%rcx,%rax,%r8,%r10,%r11,%r12-%r14
+       pxor    %xmm9,%xmm15                    # NOTE(review): presumably %xmm15 = running OCB offset, %xmm9 = round-key mask - confirm in aesni_ocb_decrypt
+       movdqu  (%rbx,%r12,1),%xmm11            # 16-byte table entry at %rbx+%r12
+       movdqa  %xmm10,%xmm12                   # keep a copy of the incoming %xmm10
+       movdqu  (%rbx,%r13,1),%xmm13            # table entry at %rbx+%r13
+       movdqa  %xmm10,%xmm14                   # second copy of the incoming %xmm10
+       pxor    %xmm15,%xmm10                   # mask for block 0
+       movdqu  (%rbx,%r14,1),%xmm15            # table entry at %rbx+%r14
+       pxor    %xmm10,%xmm11                   # mask chain: each mask = previous mask XOR next entry
+       pxor    %xmm10,%xmm2                    # block 0 ^= mask 0
+       pxor    %xmm11,%xmm12
+       pxor    %xmm11,%xmm3                    # block 1 ^= mask 1
+       pxor    %xmm12,%xmm13
+       pxor    %xmm12,%xmm4                    # block 2 ^= mask 2
+       pxor    %xmm13,%xmm14
+       pxor    %xmm13,%xmm5                    # block 3 ^= mask 3
+       pxor    %xmm14,%xmm15
+       pxor    %xmm14,%xmm6                    # block 4 ^= mask 4
+       pxor    %xmm15,%xmm7                    # block 5 ^= mask 5
+       movups  32(%r11),%xmm0                  # 16 bytes at %r11+32 (presumably the next round key - confirm key layout)
+
+       leaq    1(%r8),%r12                     # %r12/%r13/%r14 = %r8+1 / %r8+3 / %r8+5, taken before %r8 is advanced
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       addq    $6,%r8                          # advance the counter in %r8 by six
+       pxor    %xmm9,%xmm10                    # masks are XORed with %xmm9 again; they feed aesdeclast below
+       bsfq    %r12,%r12                       # index of lowest set bit; scaled by 16 (shlq $4) further down
+       bsfq    %r13,%r13                       # new %r12-%r14 are not used for loads again in this routine
+       bsfq    %r14,%r14
+
+.byte  102,15,56,222,209                       # aesdec %xmm1,%xmm2 (this round: %xmm1 into %xmm2..%xmm7)
+.byte  102,15,56,222,217                       # aesdec %xmm1,%xmm3
+.byte  102,15,56,222,225                       # aesdec %xmm1,%xmm4
+.byte  102,15,56,222,233                       # aesdec %xmm1,%xmm5
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+.byte  102,15,56,222,241                       # aesdec %xmm1,%xmm6
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm14
+.byte  102,15,56,222,249                       # aesdec %xmm1,%xmm7
+       movups  48(%r11),%xmm1                  # next 16 bytes of key material at %r11+48
+       pxor    %xmm9,%xmm15
+
+.byte  102,15,56,222,208                       # aesdec %xmm0,%xmm2 (this round: %xmm0 into %xmm2..%xmm7)
+.byte  102,15,56,222,216                       # aesdec %xmm0,%xmm3
+.byte  102,15,56,222,224                       # aesdec %xmm0,%xmm4
+.byte  102,15,56,222,232                       # aesdec %xmm0,%xmm5
+.byte  102,15,56,222,240                       # aesdec %xmm0,%xmm6
+.byte  102,15,56,222,248                       # aesdec %xmm0,%xmm7
+       movups  64(%r11),%xmm0
+       shlq    $4,%r12                         # bit index -> byte offset (x16)
+       shlq    $4,%r13
+       jmp     .Locb_dec_loop6                 # skip the .align padding in front of the loop
+
+.align 32
+.Locb_dec_loop6:                               # two rounds per iteration, alternating %xmm1 and %xmm0
+.byte  102,15,56,222,209                       # aesdec %xmm1,%xmm2..%xmm7 (six lines)
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movups  (%rcx,%rax,1),%xmm1             # next key block at %rcx+%rax
+       addq    $32,%rax                        # sets ZF for the jnz below; nothing in between writes flags
+
+.byte  102,15,56,222,208                       # aesdec %xmm0,%xmm2..%xmm7 (six lines)
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_dec_loop6                 # loop until %rax reaches zero
+
+.byte  102,15,56,222,209                       # last full round: aesdec %xmm1,%xmm2..%xmm7
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movups  16(%r11),%xmm1                  # reload %xmm1 from %r11+16; this routine uses %xmm1 on entry without loading it
+       shlq    $4,%r14                         # third offset scaled x16
+
+.byte  102,65,15,56,223,210                    # aesdeclast %xmm10,%xmm2
+       movdqu  (%rbx),%xmm10                   # reload %xmm10 from the first 16 bytes at %rbx
+       movq    %r10,%rax                       # restore the loop counter from %r10
+.byte  102,65,15,56,223,219                    # aesdeclast %xmm11,%xmm3
+.byte  102,65,15,56,223,228                    # aesdeclast %xmm12,%xmm4
+.byte  102,65,15,56,223,237                    # aesdeclast %xmm13,%xmm5
+.byte  102,65,15,56,223,246                    # aesdeclast %xmm14,%xmm6
+.byte  102,65,15,56,223,255                    # aesdeclast %xmm15,%xmm7
+       .byte   0xf3,0xc3                       # rep ret
+.size  __ocb_decrypt6,.-__ocb_decrypt6
+
+.type  __ocb_decrypt4,@function                # helper: runs four 16-byte blocks (%xmm2..%xmm5) through the AES decrypt rounds in parallel
+.align 32
+__ocb_decrypt4:                                # reads caller-prepared %xmm1,%xmm9,%xmm10,%xmm15,%rbx,%rcx,%rax,%r10,%r11,%r12,%r13
+       pxor    %xmm9,%xmm15                    # NOTE(review): presumably %xmm15 = running OCB offset, %xmm9 = round-key mask - confirm in aesni_ocb_decrypt
+       movdqu  (%rbx,%r12,1),%xmm11            # 16-byte table entry at %rbx+%r12
+       movdqa  %xmm10,%xmm12                   # keep a copy of the incoming %xmm10
+       movdqu  (%rbx,%r13,1),%xmm13            # table entry at %rbx+%r13
+       pxor    %xmm15,%xmm10                   # mask for block 0
+       pxor    %xmm10,%xmm11                   # mask chain: each mask = previous mask XOR next entry
+       pxor    %xmm10,%xmm2                    # block 0 ^= mask 0
+       pxor    %xmm11,%xmm12
+       pxor    %xmm11,%xmm3                    # block 1 ^= mask 1
+       pxor    %xmm12,%xmm13
+       pxor    %xmm12,%xmm4                    # block 2 ^= mask 2
+       pxor    %xmm13,%xmm5                    # block 3 ^= mask 3
+       movups  32(%r11),%xmm0                  # 16 bytes at %r11+32 (presumably the next round key - confirm key layout)
+
+       pxor    %xmm9,%xmm10                    # masks are XORed with %xmm9 again; they feed aesdeclast below
+       pxor    %xmm9,%xmm11                    # the .Locb_dec_two/three/four callers copy %xmm11/%xmm12/%xmm13 into %xmm15 afterwards
+       pxor    %xmm9,%xmm12
+       pxor    %xmm9,%xmm13
+
+.byte  102,15,56,222,209                       # aesdec %xmm1,%xmm2
+.byte  102,15,56,222,217                       # aesdec %xmm1,%xmm3
+.byte  102,15,56,222,225                       # aesdec %xmm1,%xmm4
+.byte  102,15,56,222,233                       # aesdec %xmm1,%xmm5
+       movups  48(%r11),%xmm1                  # next 16 bytes of key material at %r11+48
+
+.byte  102,15,56,222,208                       # aesdec %xmm0,%xmm2
+.byte  102,15,56,222,216                       # aesdec %xmm0,%xmm3
+.byte  102,15,56,222,224                       # aesdec %xmm0,%xmm4
+.byte  102,15,56,222,232                       # aesdec %xmm0,%xmm5
+       movups  64(%r11),%xmm0
+       jmp     .Locb_dec_loop4                 # skip the .align padding in front of the loop
+
+.align 32
+.Locb_dec_loop4:                               # two rounds per iteration, alternating %xmm1 and %xmm0
+.byte  102,15,56,222,209                       # aesdec %xmm1,%xmm2..%xmm5 (four lines)
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  (%rcx,%rax,1),%xmm1             # next key block at %rcx+%rax
+       addq    $32,%rax                        # sets ZF for the jnz below; nothing in between writes flags
+
+.byte  102,15,56,222,208                       # aesdec %xmm0,%xmm2..%xmm5 (four lines)
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_dec_loop4                 # loop until %rax reaches zero
+
+.byte  102,15,56,222,209                       # last full round: aesdec %xmm1,%xmm2..%xmm5
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  16(%r11),%xmm1                  # reload %xmm1 from %r11+16; this routine uses %xmm1 on entry without loading it
+       movq    %r10,%rax                       # restore the loop counter from %r10
+
+.byte  102,65,15,56,223,210                    # aesdeclast %xmm10,%xmm2
+.byte  102,65,15,56,223,219                    # aesdeclast %xmm11,%xmm3
+.byte  102,65,15,56,223,228                    # aesdeclast %xmm12,%xmm4
+.byte  102,65,15,56,223,237                    # aesdeclast %xmm13,%xmm5
+       .byte   0xf3,0xc3                       # rep ret
+.size  __ocb_decrypt4,.-__ocb_decrypt4
+
+.type  __ocb_decrypt1,@function                # helper: runs the single 16-byte block in %xmm2 through the AES decrypt rounds
+.align 32
+__ocb_decrypt1:                                # reads caller-prepared %xmm1,%xmm7,%xmm9,%xmm15,%rcx,%rax,%r10,%r11
+       pxor    %xmm15,%xmm7                    # .Locb_dec_one copies %xmm10 into %xmm7 before the call; fold in %xmm15
+       pxor    %xmm9,%xmm7                     # NOTE(review): presumably %xmm9 is a round-key mask - confirm in aesni_ocb_decrypt
+       pxor    %xmm7,%xmm2                     # block ^= mask
+       movups  32(%r11),%xmm0                  # 16 bytes at %r11+32 (presumably the next round key - confirm key layout)
+
+.byte  102,15,56,222,209                       # aesdec %xmm1,%xmm2
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm7                     # mask XORed with %xmm9 again; it feeds aesdeclast below
+
+.byte  102,15,56,222,208                       # aesdec %xmm0,%xmm2
+       movups  64(%r11),%xmm0
+       jmp     .Locb_dec_loop1                 # skip the .align padding in front of the loop
+
+.align 32
+.Locb_dec_loop1:                               # two rounds per iteration, alternating %xmm1 and %xmm0
+.byte  102,15,56,222,209                       # aesdec %xmm1,%xmm2
+       movups  (%rcx,%rax,1),%xmm1             # next key block at %rcx+%rax
+       addq    $32,%rax                        # sets ZF for the jnz below; nothing in between writes flags
+
+.byte  102,15,56,222,208                       # aesdec %xmm0,%xmm2
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Locb_dec_loop1                 # loop until %rax reaches zero
+
+.byte  102,15,56,222,209                       # last full round: aesdec %xmm1,%xmm2
+       movups  16(%r11),%xmm1                  # reload %xmm1 from %r11+16; this routine uses %xmm1 on entry without loading it
+       movq    %r10,%rax                       # restore the loop counter from %r10
+
+.byte  102,15,56,223,215                       # aesdeclast %xmm7,%xmm2
+       .byte   0xf3,0xc3                       # rep ret
+.size  __ocb_decrypt1,.-__ocb_decrypt1
+.globl aesni_cbc_encrypt
+.type  aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+.cfi_startproc 
+       testq   %rdx,%rdx
+       jz      .Lcbc_ret
+
+       movl    240(%rcx),%r10d
+       movq    %rcx,%r11
+       testl   %r9d,%r9d
+       jz      .Lcbc_decrypt
+
+       movups  (%r8),%xmm2
+       movl    %r10d,%eax
+       cmpq    $16,%rdx
+       jb      .Lcbc_enc_tail
+       subq    $16,%rdx
+       jmp     .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+       movups  (%rdi),%xmm3
+       leaq    16(%rdi),%rdi
+
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       xorps   %xmm0,%xmm3
+       leaq    32(%rcx),%rcx
+       xorps   %xmm3,%xmm2
+.Loop_enc1_15:
+.byte  102,15,56,220,209
+       decl    %eax
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_enc1_15
+.byte  102,15,56,221,209
+       movl    %r10d,%eax
+       movq    %r11,%rcx
+       movups  %xmm2,0(%rsi)
+       leaq    16(%rsi),%rsi
+       subq    $16,%rdx
+       jnc     .Lcbc_enc_loop
+       addq    $16,%rdx
+       jnz     .Lcbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       jmp     .Lcbc_ret
+
+.Lcbc_enc_tail:
+       movq    %rdx,%rcx
+       xchgq   %rdi,%rsi
+.long  0x9066A4F3
+       movl    $16,%ecx
+       subq    %rdx,%rcx
+       xorl    %eax,%eax
+.long  0x9066AAF3
+       leaq    -16(%rdi),%rdi
+       movl    %r10d,%eax
+       movq    %rdi,%rsi
+       movq    %r11,%rcx
+       xorq    %rdx,%rdx
+       jmp     .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+       cmpq    $16,%rdx
+       jne     .Lcbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+       leaq    (%rsp),%r11
+.cfi_def_cfa_register  %r11
+       pushq   %rbp
+.cfi_offset    %rbp,-16
+       subq    $16,%rsp
+       andq    $-16,%rsp
+       movq    %rcx,%rbp
+       movups  (%r8),%xmm10
+       movl    %r10d,%eax
+       cmpq    $0x50,%rdx
+       jbe     .Lcbc_dec_tail
+
+       movups  (%rcx),%xmm0
+       movdqu  0(%rdi),%xmm2
+       movdqu  16(%rdi),%xmm3
+       movdqa  %xmm2,%xmm11
+       movdqu  32(%rdi),%xmm4
+       movdqa  %xmm3,%xmm12
+       movdqu  48(%rdi),%xmm5
+       movdqa  %xmm4,%xmm13
+       movdqu  64(%rdi),%xmm6
+       movdqa  %xmm5,%xmm14
+       movdqu  80(%rdi),%xmm7
+       movdqa  %xmm6,%xmm15
+       movl    _gnutls_x86_cpuid_s+4(%rip),%r9d
+       cmpq    $0x70,%rdx
+       jbe     .Lcbc_dec_six_or_seven
+
+       andl    $71303168,%r9d
+       subq    $0x50,%rdx
+       cmpl    $4194304,%r9d
+       je      .Lcbc_dec_loop6_enter
+       subq    $0x20,%rdx
+       leaq    112(%rcx),%rcx
+       jmp     .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+       movups  %xmm9,(%rsi)
+       leaq    16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+       movdqu  96(%rdi),%xmm8
+       pxor    %xmm0,%xmm2
+       movdqu  112(%rdi),%xmm9
+       pxor    %xmm0,%xmm3
+       movups  16-112(%rcx),%xmm1
+       pxor    %xmm0,%xmm4
+       movq    $-1,%rbp
+       cmpq    $0x70,%rdx
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+       pxor    %xmm0,%xmm8
+
+.byte  102,15,56,222,209
+       pxor    %xmm0,%xmm9
+       movups  32-112(%rcx),%xmm0
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+.byte  102,68,15,56,222,193
+       adcq    $0,%rbp
+       andq    $128,%rbp
+.byte  102,68,15,56,222,201
+       addq    %rdi,%rbp
+       movups  48-112(%rcx),%xmm1
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+.byte  102,68,15,56,222,192
+.byte  102,68,15,56,222,200
+       movups  64-112(%rcx),%xmm0
+       nop
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+.byte  102,68,15,56,222,193
+.byte  102,68,15,56,222,201
+       movups  80-112(%rcx),%xmm1
+       nop
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+.byte  102,68,15,56,222,192
+.byte  102,68,15,56,222,200
+       movups  96-112(%rcx),%xmm0
+       nop
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+.byte  102,68,15,56,222,193
+.byte  102,68,15,56,222,201
+       movups  112-112(%rcx),%xmm1
+       nop
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+.byte  102,68,15,56,222,192
+.byte  102,68,15,56,222,200
+       movups  128-112(%rcx),%xmm0
        nop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
@@ -2867,18 +3763,18 @@ aesni_cbc_encrypt:
        movdqu  112(%rdi),%xmm0
 .byte  102,65,15,56,223,228
        leaq    128(%rdi),%rdi
-       movdqu  0(%r11),%xmm11
+       movdqu  0(%rbp),%xmm11
 .byte  102,65,15,56,223,237
 .byte  102,65,15,56,223,246
-       movdqu  16(%r11),%xmm12
-       movdqu  32(%r11),%xmm13
+       movdqu  16(%rbp),%xmm12
+       movdqu  32(%rbp),%xmm13
 .byte  102,65,15,56,223,255
 .byte  102,68,15,56,223,193
-       movdqu  48(%r11),%xmm14
-       movdqu  64(%r11),%xmm15
+       movdqu  48(%rbp),%xmm14
+       movdqu  64(%rbp),%xmm15
 .byte  102,69,15,56,223,202
        movdqa  %xmm0,%xmm10
-       movdqu  80(%r11),%xmm1
+       movdqu  80(%rbp),%xmm1
        movups  -112(%rcx),%xmm0
 
        movups  %xmm2,(%rsi)
@@ -2997,7 +3893,7 @@ aesni_cbc_encrypt:
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
        pxor    %xmm14,%xmm6
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movdqu  %xmm5,48(%rsi)
        pxor    %xmm15,%xmm7
        movl    %r10d,%eax
@@ -3150,16 +4046,21 @@ aesni_cbc_encrypt:
 .Lcbc_dec_ret:
        xorps   %xmm0,%xmm0
        pxor    %xmm1,%xmm1
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+.cfi_restore   %rbp
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
 .Lcbc_ret:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  aesni_cbc_encrypt,.-aesni_cbc_encrypt
 .globl aesni_set_decrypt_key
 .type  aesni_set_decrypt_key,@function
 .align 16
 aesni_set_decrypt_key:
+.cfi_startproc 
 .byte  0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset 8
        call    __aesni_set_encrypt_key
        shll    $4,%esi
        testl   %eax,%eax
@@ -3192,7 +4093,9 @@ aesni_set_decrypt_key:
        pxor    %xmm0,%xmm0
 .Ldec_key_ret:
        addq    $8,%rsp
+.cfi_adjust_cfa_offset -8
        .byte   0xf3,0xc3
+.cfi_endproc   
 .LSEH_end_set_decrypt_key:
 .size  aesni_set_decrypt_key,.-aesni_set_decrypt_key
 .globl aesni_set_encrypt_key
@@ -3200,7 +4103,9 @@ aesni_set_decrypt_key:
 .align 16
 aesni_set_encrypt_key:
 __aesni_set_encrypt_key:
+.cfi_startproc 
 .byte  0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset 8
        movq    $-1,%rax
        testq   %rdi,%rdi
        jz      .Lenc_key_ret
@@ -3493,7 +4398,9 @@ __aesni_set_encrypt_key:
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        addq    $8,%rsp
+.cfi_adjust_cfa_offset -8
        .byte   0xf3,0xc3
+.cfi_endproc   
 .LSEH_end_set_encrypt_key:
 
 .align 16
index 8817cd34f2fb46a0ad0869fbb4c6a31f1816af24..4427b6bdd0647dadf7fa5f9d9f93a28aa2553ab5 100644 (file)
@@ -21,7 +21,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "devel/perlasm/cpuid-x86.s"
 .text
 .globl gnutls_cpuid
 .type  gnutls_cpuid,@function
index 82858b2444891ed5559591ae4600484e70b52ee6..0740edcd263b8dc6302cdd5998d55839d08e6741 100644 (file)
@@ -56,7 +56,4 @@ gnutls_cpuid:
        .byte   0xf3,0xc3
 .size  gnutls_cpuid,.-gnutls_cpuid
 
-
 .section .note.GNU-stack,"",%progbits
-
-
index e2568a6fd69ac534ff8223ad3edb4c886ad2002e..1e4d18b341b3f711389b598ae326d06240d67618 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 .type  gcm_gmult_4bit,@function
 .align 16
 gcm_gmult_4bit:
+.cfi_startproc 
        pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+       subq    $280,%rsp
+.cfi_adjust_cfa_offset 280
 .Lgmult_prologue:
 
        movzbq  15(%rdi),%r8
@@ -123,22 +141,41 @@ gcm_gmult_4bit:
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)
 
-       movq    16(%rsp),%rbx
-       leaq    24(%rsp),%rsp
+       leaq    280+48(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lgmult_epilogue:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_gmult_4bit,.-gcm_gmult_4bit
 .globl gcm_ghash_4bit
 .type  gcm_ghash_4bit,@function
 .align 16
 gcm_ghash_4bit:
+.cfi_startproc 
        pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
        pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
        pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
        pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
        subq    $280,%rsp
+.cfi_adjust_cfa_offset 280
 .Lghash_prologue:
        movq    %rdx,%r14
        movq    %rcx,%r15
@@ -683,21 +720,31 @@ gcm_ghash_4bit:
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)
 
-       leaq    280(%rsp),%rsi
-       movq    0(%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
+       leaq    280+48(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    0(%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lghash_epilogue:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_ghash_4bit,.-gcm_ghash_4bit
 .globl gcm_init_clmul
 .type  gcm_init_clmul,@function
 .align 16
 gcm_init_clmul:
+.cfi_startproc 
 .L_init_clmul:
        movdqu  (%rsi),%xmm2
        pshufd  $78,%xmm2,%xmm2
@@ -849,11 +896,13 @@ gcm_init_clmul:
 .byte  102,15,58,15,227,8
        movdqu  %xmm4,80(%rdi)
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_init_clmul,.-gcm_init_clmul
 .globl gcm_gmult_clmul
 .type  gcm_gmult_clmul,@function
 .align 16
 gcm_gmult_clmul:
+.cfi_startproc 
 .L_gmult_clmul:
        movdqu  (%rdi),%xmm0
        movdqa  .Lbswap_mask(%rip),%xmm5
@@ -900,11 +949,13 @@ gcm_gmult_clmul:
 .byte  102,15,56,0,197
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_gmult_clmul,.-gcm_gmult_clmul
 .globl gcm_ghash_clmul
 .type  gcm_ghash_clmul,@function
 .align 32
 gcm_ghash_clmul:
+.cfi_startproc 
 .L_ghash_clmul:
        movdqa  .Lbswap_mask(%rip),%xmm10
 
@@ -1283,11 +1334,13 @@ gcm_ghash_clmul:
 .byte  102,65,15,56,0,194
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_ghash_clmul,.-gcm_ghash_clmul
 .globl gcm_init_avx
 .type  gcm_init_avx,@function
 .align 32
 gcm_init_avx:
+.cfi_startproc 
        vzeroupper
 
        vmovdqu (%rsi),%xmm2
@@ -1390,17 +1443,21 @@ gcm_init_avx:
 
        vzeroupper
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_init_avx,.-gcm_init_avx
 .globl gcm_gmult_avx
 .type  gcm_gmult_avx,@function
 .align 32
 gcm_gmult_avx:
+.cfi_startproc # the whole body is one tail jump to .L_gmult_clmul, the label right after gcm_gmult_clmul's entry
        jmp     .L_gmult_clmul
+.cfi_endproc   # closes the CFI region; no push or stack adjustment happens inside it
 .size  gcm_gmult_avx,.-gcm_gmult_avx
 .globl gcm_ghash_avx
 .type  gcm_ghash_avx,@function
 .align 32
 gcm_ghash_avx:
+.cfi_startproc 
        vzeroupper
 
        vmovdqu (%rdi),%xmm10
@@ -1772,6 +1829,7 @@ gcm_ghash_avx:
        vmovdqu %xmm10,(%rdi)
        vzeroupper
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  gcm_ghash_avx,.-gcm_ghash_avx
 .align 64
 .Lbswap_mask:
index 7b585a0f3e650d5ecfeabbeb44430ecec349e041..8bfbcb6b397ff1fdaeb43a4074ad8ad0ee7cb35d 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha1-586.s"
 .text
 .globl sha1_block_data_order
 .type  sha1_block_data_order,@function
@@ -1418,7 +1417,4 @@ sha1_block_data_order:
 .byte  89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
 .byte  114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 
-
 .section .note.GNU-stack,"",%progbits
-
-
index af40532f12ff8416315e5bbb13a56a2eeb488e3b..1e6546e11e60fef80add577a2211e484cfd99ebf 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 .type  sha1_block_data_order,@function
 .align 16
 sha1_block_data_order:
+.cfi_startproc 
        movl    _gnutls_x86_cpuid_s+0(%rip),%r9d
        movl    _gnutls_x86_cpuid_s+4(%rip),%r8d
+       movl    _gnutls_x86_cpuid_s+8(%rip),%r10d
        testl   $512,%r8d
        jz      .Lialu
+       testl   $536870912,%r10d
+       jnz     _shaext_shortcut
+       andl    $296,%r10d
+       cmpl    $296,%r10d
+       je      _avx2_shortcut
+       andl    $268435456,%r8d
+       andl    $1073741824,%r9d
+       orl     %r9d,%r8d
+       cmpl    $1342177280,%r8d
+       je      _avx_shortcut
        jmp     _ssse3_shortcut
 
 .align 16
 .Lialu:
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
        pushq   %rbx
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_offset    %r12,-32
        pushq   %r13
-       movq    %rsp,%r11
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
        movq    %rdi,%r8
        subq    $72,%rsp
        movq    %rsi,%r9
        andq    $-64,%rsp
        movq    %rdx,%r10
-       movq    %r11,64(%rsp)
+       movq    %rax,64(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
 .Lprologue:
 
        movl    0(%r8),%esi
@@ -76,1230 +96,1168 @@ sha1_block_data_order:
 .Lloop:
        movl    0(%r9),%edx
        bswapl  %edx
-       movl    %edx,0(%rsp)
-       movl    %r11d,%eax
        movl    4(%r9),%ebp
+       movl    %r12d,%eax
+       movl    %edx,0(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
        bswapl  %ebp
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %ebp,4(%rsp)
+       leal    1518500249(%rdx,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
-       movl    8(%r9),%edx
+       movl    8(%r9),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,4(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %edx,8(%rsp)
+       leal    1518500249(%rbp,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
-       movl    12(%r9),%ebp
+       movl    12(%r9),%edx
+       movl    %edi,%eax
+       movl    %r14d,8(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %ebp,12(%rsp)
+       leal    1518500249(%r14,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    16(%r9),%edx
+       movl    16(%r9),%ebp
+       movl    %esi,%eax
+       movl    %edx,12(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %edx
+       bswapl  %ebp
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %edx,16(%rsp)
+       leal    1518500249(%rdx,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    20(%r9),%ebp
+       movl    20(%r9),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,16(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %ebp
+       bswapl  %r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %ebp,20(%rsp)
+       leal    1518500249(%rbp,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    %r11d,%eax
        movl    24(%r9),%edx
+       movl    %r12d,%eax
+       movl    %r14d,20(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
        bswapl  %edx
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %edx,24(%rsp)
+       leal    1518500249(%r14,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
        movl    28(%r9),%ebp
+       movl    %r11d,%eax
+       movl    %edx,24(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
        bswapl  %ebp
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %ebp,28(%rsp)
+       leal    1518500249(%rdx,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
-       movl    32(%r9),%edx
+       movl    32(%r9),%r14d
+       movl    %edi,%eax
+       movl    %ebp,28(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %edx,32(%rsp)
+       leal    1518500249(%rbp,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    36(%r9),%ebp
+       movl    36(%r9),%edx
+       movl    %esi,%eax
+       movl    %r14d,32(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %ebp,36(%rsp)
+       leal    1518500249(%r14,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    40(%r9),%edx
+       movl    40(%r9),%ebp
+       movl    %r13d,%eax
+       movl    %edx,36(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %edx
+       bswapl  %ebp
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %edx,40(%rsp)
+       leal    1518500249(%rdx,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    %r11d,%eax
-       movl    44(%r9),%ebp
+       movl    44(%r9),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,40(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
-       bswapl  %ebp
+       bswapl  %r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %ebp,44(%rsp)
+       leal    1518500249(%rbp,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
        movl    48(%r9),%edx
+       movl    %r11d,%eax
+       movl    %r14d,44(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
        bswapl  %edx
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %edx,48(%rsp)
+       leal    1518500249(%r14,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
        movl    52(%r9),%ebp
+       movl    %edi,%eax
+       movl    %edx,48(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
        bswapl  %ebp
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %ebp,52(%rsp)
+       leal    1518500249(%rdx,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    56(%r9),%edx
+       movl    56(%r9),%r14d
+       movl    %esi,%eax
+       movl    %ebp,52(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %edx,56(%rsp)
+       leal    1518500249(%rbp,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    60(%r9),%ebp
+       movl    60(%r9),%edx
+       movl    %r13d,%eax
+       movl    %r14d,56(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %ebp,60(%rsp)
+       leal    1518500249(%r14,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    0(%rsp),%edx
-       movl    %r11d,%eax
+       xorl    0(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,60(%rsp)
        movl    %esi,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    8(%rsp),%ebp
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       xorl    32(%rsp),%edx
+       xorl    32(%rsp),%ebp
        andl    %edi,%eax
-       leal    1518500249(%rbp,%r13,1),%r13d
-       xorl    52(%rsp),%edx
+       leal    1518500249(%rdx,%r13,1),%r13d
+       roll    $30,%edi
        xorl    %r12d,%eax
-       roll    $1,%edx
        addl    %ecx,%r13d
-       roll    $30,%edi
-       movl    %edx,0(%rsp)
+       roll    $1,%ebp
        addl    %eax,%r13d
-       movl    4(%rsp),%ebp
-       movl    %edi,%eax
+       xorl    4(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,0(%rsp)
        movl    %r13d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    12(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
-       xorl    36(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        andl    %esi,%eax
-       leal    1518500249(%rdx,%r12,1),%r12d
-       xorl    56(%rsp),%ebp
+       leal    1518500249(%rbp,%r12,1),%r12d
+       roll    $30,%esi
        xorl    %r11d,%eax
-       roll    $1,%ebp
        addl    %ecx,%r12d
-       roll    $30,%esi
-       movl    %ebp,4(%rsp)
+       roll    $1,%r14d
        addl    %eax,%r12d
-       movl    8(%rsp),%edx
-       movl    %esi,%eax
+       xorl    8(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,4(%rsp)
        movl    %r12d,%ecx
        xorl    16(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
        xorl    40(%rsp),%edx
        andl    %r13d,%eax
-       leal    1518500249(%rbp,%r11,1),%r11d
-       xorl    60(%rsp),%edx
+       leal    1518500249(%r14,%r11,1),%r11d
+       roll    $30,%r13d
        xorl    %edi,%eax
-       roll    $1,%edx
        addl    %ecx,%r11d
-       roll    $30,%r13d
-       movl    %edx,8(%rsp)
+       roll    $1,%edx
        addl    %eax,%r11d
-       movl    12(%rsp),%ebp
-       movl    %r13d,%eax
+       xorl    12(%rsp),%ebp
+       movl    %esi,%eax
+       movl    %edx,8(%rsp)
        movl    %r11d,%ecx
        xorl    20(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
        xorl    44(%rsp),%ebp
        andl    %r12d,%eax
        leal    1518500249(%rdx,%rdi,1),%edi
-       xorl    0(%rsp),%ebp
+       roll    $30,%r12d
        xorl    %esi,%eax
-       roll    $1,%ebp
        addl    %ecx,%edi
-       roll    $30,%r12d
-       movl    %ebp,12(%rsp)
+       roll    $1,%ebp
        addl    %eax,%edi
-       movl    16(%rsp),%edx
-       movl    %r12d,%eax
+       xorl    16(%rsp),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,12(%rsp)
        movl    %edi,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    24(%rsp),%r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       xorl    48(%rsp),%edx
+       xorl    48(%rsp),%r14d
        andl    %r11d,%eax
        leal    1518500249(%rbp,%rsi,1),%esi
-       xorl    4(%rsp),%edx
+       roll    $30,%r11d
        xorl    %r13d,%eax
-       roll    $1,%edx
        addl    %ecx,%esi
-       roll    $30,%r11d
-       movl    %edx,16(%rsp)
+       roll    $1,%r14d
        addl    %eax,%esi
-       movl    20(%rsp),%ebp
-       movl    %r11d,%eax
+       xorl    20(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,16(%rsp)
        movl    %esi,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r13,1),%r13d
-       xorl    52(%rsp),%ebp
+       xorl    28(%rsp),%edx
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    52(%rsp),%edx
+       leal    1859775393(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    8(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%ebp
-       movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %edi,%eax
+       roll    $1,%edx
+       xorl    24(%rsp),%ebp
+       movl    %esi,%eax
+       movl    %edx,20(%rsp)
        movl    %r13d,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%r12,1),%r12d
-       xorl    56(%rsp),%edx
+       xorl    32(%rsp),%ebp
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    56(%rsp),%ebp
+       leal    1859775393(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    12(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%ebp
+       xorl    28(%rsp),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,24(%rsp)
        movl    %r12d,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r11,1),%r11d
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%r14d
+       leal    1859775393(%rbp,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    16(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    32(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,28(%rsp)
        movl    %r11d,%ecx
        xorl    40(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%rdi,1),%edi
        xorl    0(%rsp),%edx
-       xorl    %esi,%eax
+       leal    1859775393(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    20(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
        roll    $1,%edx
+       xorl    36(%rsp),%ebp
+       movl    %r11d,%eax
        movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %r12d,%eax
        movl    %edi,%ecx
        xorl    44(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%rsi,1),%esi
        xorl    4(%rsp),%ebp
-       xorl    %r13d,%eax
+       leal    1859775393(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    24(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
        roll    $1,%ebp
+       xorl    40(%rsp),%r14d
+       movl    %edi,%eax
        movl    %ebp,36(%rsp)
-       movl    40(%rsp),%edx
-       movl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    48(%rsp),%r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
+       xorl    8(%rsp),%r14d
        leal    1859775393(%rbp,%r13,1),%r13d
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    28(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%edx
-       movl    %edx,40(%rsp)
-       movl    44(%rsp),%ebp
-       movl    %edi,%eax
+       roll    $1,%r14d
+       xorl    44(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,40(%rsp)
        movl    %r13d,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r12,1),%r12d
-       xorl    12(%rsp),%ebp
+       xorl    52(%rsp),%edx
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    12(%rsp),%edx
+       leal    1859775393(%r14,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    32(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%ebp
-       movl    %ebp,44(%rsp)
-       movl    48(%rsp),%edx
-       movl    %esi,%eax
+       roll    $1,%edx
+       xorl    48(%rsp),%ebp
+       movl    %r13d,%eax
+       movl    %edx,44(%rsp)
        movl    %r12d,%ecx
-       xorl    56(%rsp),%edx
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%r11,1),%r11d
-       xorl    16(%rsp),%edx
+       xorl    56(%rsp),%ebp
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    16(%rsp),%ebp
+       leal    1859775393(%rdx,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    36(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,48(%rsp)
-       movl    52(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%ebp
+       xorl    52(%rsp),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,48(%rsp)
        movl    %r11d,%ecx
-       xorl    60(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rdi,1),%edi
-       xorl    20(%rsp),%ebp
+       xorl    60(%rsp),%r14d
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    20(%rsp),%r14d
+       leal    1859775393(%rbp,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    40(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,52(%rsp)
-       movl    56(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%r14d
+       xorl    56(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,52(%rsp)
        movl    %edi,%ecx
        xorl    0(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%rsi,1),%esi
        xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       leal    1859775393(%r14,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    44(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
        roll    $1,%edx
+       xorl    60(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,56(%rsp)
-       movl    60(%rsp),%ebp
-       movl    %r11d,%eax
        movl    %esi,%ecx
        xorl    4(%rsp),%ebp
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%r13,1),%r13d
        xorl    28(%rsp),%ebp
-       xorl    %r12d,%eax
+       leal    1859775393(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    48(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%ebp
+       xorl    0(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,60(%rsp)
-       movl    0(%rsp),%edx
-       movl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    8(%rsp),%r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
+       xorl    32(%rsp),%r14d
        leal    1859775393(%rbp,%r12,1),%r12d
-       xorl    32(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    52(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,0(%rsp)
-       movl    4(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    4(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,0(%rsp)
        movl    %r12d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r11,1),%r11d
-       xorl    36(%rsp),%ebp
+       xorl    12(%rsp),%edx
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    36(%rsp),%edx
+       leal    1859775393(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    56(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,4(%rsp)
-       movl    8(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%edx
+       xorl    8(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,4(%rsp)
        movl    %r11d,%ecx
-       xorl    16(%rsp),%edx
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%rdi,1),%edi
-       xorl    40(%rsp),%edx
+       xorl    16(%rsp),%ebp
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    40(%rsp),%ebp
+       leal    1859775393(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    60(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,8(%rsp)
-       movl    12(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    12(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,8(%rsp)
        movl    %edi,%ecx
-       xorl    20(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rsi,1),%esi
-       xorl    44(%rsp),%ebp
+       xorl    20(%rsp),%r14d
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    44(%rsp),%r14d
+       leal    1859775393(%rbp,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    0(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,12(%rsp)
-       movl    16(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%r14d
+       xorl    16(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,12(%rsp)
        movl    %esi,%ecx
        xorl    24(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%r13,1),%r13d
        xorl    48(%rsp),%edx
-       xorl    %r12d,%eax
+       leal    1859775393(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    4(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%edx
+       xorl    20(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,16(%rsp)
-       movl    20(%rsp),%ebp
-       movl    %edi,%eax
        movl    %r13d,%ecx
        xorl    28(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%r12,1),%r12d
        xorl    52(%rsp),%ebp
-       xorl    %r11d,%eax
+       leal    1859775393(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    8(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%ebp
+       xorl    24(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    32(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
+       xorl    56(%rsp),%r14d
        leal    1859775393(%rbp,%r11,1),%r11d
-       xorl    56(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    12(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    28(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,24(%rsp)
        movl    %r11d,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rdi,1),%edi
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%edx
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%edx
+       leal    1859775393(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    16(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%edx
+       xorl    32(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,28(%rsp)
        movl    %edi,%ecx
-       xorl    40(%rsp),%edx
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%rsi,1),%esi
-       xorl    0(%rsp),%edx
+       xorl    40(%rsp),%ebp
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    0(%rsp),%ebp
+       leal    1859775393(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    20(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%edx
-       movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    44(%rsp),%ebp
-       andl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    36(%rsp),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,32(%rsp)
+       movl    %r12d,%ebx
+       xorl    44(%rsp),%r14d
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    4(%rsp),%r14d
+       leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    24(%rsp),%ebp
        addl    %eax,%r13d
+       roll    $1,%r14d
        andl    %edi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %ebp,36(%rsp)
        addl    %ecx,%r13d
-       movl    40(%rsp),%edx
-       movl    %edi,%eax
-       movl    %edi,%ebx
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    40(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,36(%rsp)
+       movl    %r11d,%ebx
        xorl    48(%rsp),%edx
-       andl    %r11d,%eax
+       andl    %edi,%eax
        movl    %r13d,%ecx
        xorl    8(%rsp),%edx
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rbp,%r12,1),%r12d
+       leal    -1894007588(%r14,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    28(%rsp),%edx
        addl    %eax,%r12d
-       andl    %esi,%ebx
        roll    $1,%edx
-       addl    %ebx,%r12d
+       andl    %esi,%ebx
+       addl    %ecx,%r12d
        roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    44(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,40(%rsp)
-       addl    %ecx,%r12d
-       movl    44(%rsp),%ebp
-       movl    %esi,%eax
-       movl    %esi,%ebx
+       movl    %edi,%ebx
        xorl    52(%rsp),%ebp
-       andl    %edi,%eax
+       andl    %esi,%eax
        movl    %r12d,%ecx
        xorl    12(%rsp),%ebp
-       xorl    %edi,%ebx
        leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    32(%rsp),%ebp
        addl    %eax,%r11d
-       andl    %r13d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%r11d
+       andl    %r13d,%ebx
+       addl    %ecx,%r11d
        roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    48(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,44(%rsp)
-       addl    %ecx,%r11d
-       movl    48(%rsp),%edx
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
-       xorl    56(%rsp),%edx
-       andl    %esi,%eax
+       movl    %esi,%ebx
+       xorl    56(%rsp),%r14d
+       andl    %r13d,%eax
        movl    %r11d,%ecx
-       xorl    16(%rsp),%edx
-       xorl    %esi,%ebx
+       xorl    16(%rsp),%r14d
        leal    -1894007588(%rbp,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    36(%rsp),%edx
        addl    %eax,%edi
+       roll    $1,%r14d
        andl    %r12d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%edi
-       roll    $30,%r12d
-       movl    %edx,48(%rsp)
        addl    %ecx,%edi
-       movl    52(%rsp),%ebp
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
-       xorl    60(%rsp),%ebp
-       andl    %r13d,%eax
+       roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    52(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,48(%rsp)
+       movl    %r13d,%ebx
+       xorl    60(%rsp),%edx
+       andl    %r12d,%eax
        movl    %edi,%ecx
-       xorl    20(%rsp),%ebp
-       xorl    %r13d,%ebx
-       leal    -1894007588(%rdx,%rsi,1),%esi
+       xorl    20(%rsp),%edx
+       leal    -1894007588(%r14,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    40(%rsp),%ebp
        addl    %eax,%esi
+       roll    $1,%edx
        andl    %r11d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%esi
-       roll    $30,%r11d
-       movl    %ebp,52(%rsp)
        addl    %ecx,%esi
-       movl    56(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    0(%rsp),%edx
-       andl    %r12d,%eax
+       roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    56(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,52(%rsp)
+       movl    %r12d,%ebx
+       xorl    0(%rsp),%ebp
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    24(%rsp),%ebp
+       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    44(%rsp),%edx
        addl    %eax,%r13d
+       roll    $1,%ebp
        andl    %edi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %edx,56(%rsp)
        addl    %ecx,%r13d
-       movl    60(%rsp),%ebp
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    4(%rsp),%ebp
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    60(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,56(%rsp)
+       movl    %r11d,%ebx
+       xorl    4(%rsp),%r14d
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    28(%rsp),%r14d
+       leal    -1894007588(%rbp,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    48(%rsp),%ebp
        addl    %eax,%r12d
+       roll    $1,%r14d
        andl    %esi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %ebp,60(%rsp)
        addl    %ecx,%r12d
-       movl    0(%rsp),%edx
-       movl    %esi,%eax
-       movl    %esi,%ebx
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    0(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,60(%rsp)
+       movl    %edi,%ebx
        xorl    8(%rsp),%edx
-       andl    %edi,%eax
+       andl    %esi,%eax
        movl    %r12d,%ecx
        xorl    32(%rsp),%edx
-       xorl    %edi,%ebx
-       leal    -1894007588(%rbp,%r11,1),%r11d
+       leal    -1894007588(%r14,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    52(%rsp),%edx
        addl    %eax,%r11d
-       andl    %r13d,%ebx
        roll    $1,%edx
-       addl    %ebx,%r11d
+       andl    %r13d,%ebx
+       addl    %ecx,%r11d
        roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    4(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,0(%rsp)
-       addl    %ecx,%r11d
-       movl    4(%rsp),%ebp
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
+       movl    %esi,%ebx
        xorl    12(%rsp),%ebp
-       andl    %esi,%eax
+       andl    %r13d,%eax
        movl    %r11d,%ecx
        xorl    36(%rsp),%ebp
-       xorl    %esi,%ebx
        leal    -1894007588(%rdx,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    56(%rsp),%ebp
        addl    %eax,%edi
-       andl    %r12d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%edi
+       andl    %r12d,%ebx
+       addl    %ecx,%edi
        roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    8(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,4(%rsp)
-       addl    %ecx,%edi
-       movl    8(%rsp),%edx
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
-       xorl    16(%rsp),%edx
-       andl    %r13d,%eax
+       movl    %r13d,%ebx
+       xorl    16(%rsp),%r14d
+       andl    %r12d,%eax
        movl    %edi,%ecx
-       xorl    40(%rsp),%edx
-       xorl    %r13d,%ebx
+       xorl    40(%rsp),%r14d
        leal    -1894007588(%rbp,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    60(%rsp),%edx
        addl    %eax,%esi
+       roll    $1,%r14d
        andl    %r11d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%esi
-       roll    $30,%r11d
-       movl    %edx,8(%rsp)
        addl    %ecx,%esi
-       movl    12(%rsp),%ebp
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    20(%rsp),%ebp
-       andl    %r12d,%eax
+       roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    12(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,8(%rsp)
+       movl    %r12d,%ebx
+       xorl    20(%rsp),%edx
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    44(%rsp),%ebp
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    44(%rsp),%edx
+       leal    -1894007588(%r14,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    0(%rsp),%ebp
        addl    %eax,%r13d
+       roll    $1,%edx
        andl    %edi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %ebp,12(%rsp)
        addl    %ecx,%r13d
-       movl    16(%rsp),%edx
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    24(%rsp),%edx
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    16(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,12(%rsp)
+       movl    %r11d,%ebx
+       xorl    24(%rsp),%ebp
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rbp,%r12,1),%r12d
+       xorl    48(%rsp),%ebp
+       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    4(%rsp),%edx
        addl    %eax,%r12d
+       roll    $1,%ebp
        andl    %esi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %edx,16(%rsp)
        addl    %ecx,%r12d
-       movl    20(%rsp),%ebp
-       movl    %esi,%eax
-       movl    %esi,%ebx
-       xorl    28(%rsp),%ebp
-       andl    %edi,%eax
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    20(%rsp),%r14d
+       movl    %edi,%eax
+       movl    %ebp,16(%rsp)
+       movl    %edi,%ebx
+       xorl    28(%rsp),%r14d
+       andl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %edi,%ebx
-       leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    52(%rsp),%r14d
+       leal    -1894007588(%rbp,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    8(%rsp),%ebp
        addl    %eax,%r11d
+       roll    $1,%r14d
        andl    %r13d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r11d
-       roll    $30,%r13d
-       movl    %ebp,20(%rsp)
        addl    %ecx,%r11d
-       movl    24(%rsp),%edx
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
+       roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    24(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,20(%rsp)
+       movl    %esi,%ebx
        xorl    32(%rsp),%edx
-       andl    %esi,%eax
+       andl    %r13d,%eax
        movl    %r11d,%ecx
        xorl    56(%rsp),%edx
-       xorl    %esi,%ebx
-       leal    -1894007588(%rbp,%rdi,1),%edi
+       leal    -1894007588(%r14,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    12(%rsp),%edx
        addl    %eax,%edi
-       andl    %r12d,%ebx
        roll    $1,%edx
-       addl    %ebx,%edi
+       andl    %r12d,%ebx
+       addl    %ecx,%edi
        roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    28(%rsp),%ebp
+       movl    %r13d,%eax
        movl    %edx,24(%rsp)
-       addl    %ecx,%edi
-       movl    28(%rsp),%ebp
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
+       movl    %r13d,%ebx
        xorl    36(%rsp),%ebp
-       andl    %r13d,%eax
+       andl    %r12d,%eax
        movl    %edi,%ecx
        xorl    60(%rsp),%ebp
-       xorl    %r13d,%ebx
        leal    -1894007588(%rdx,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    16(%rsp),%ebp
        addl    %eax,%esi
-       andl    %r11d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%esi
+       andl    %r11d,%ebx
+       addl    %ecx,%esi
        roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    32(%rsp),%r14d
+       movl    %r12d,%eax
        movl    %ebp,28(%rsp)
-       addl    %ecx,%esi
-       movl    32(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    40(%rsp),%edx
-       andl    %r12d,%eax
+       movl    %r12d,%ebx
+       xorl    40(%rsp),%r14d
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    0(%rsp),%edx
-       xorl    %r12d,%ebx
+       xorl    0(%rsp),%r14d
        leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    20(%rsp),%edx
        addl    %eax,%r13d
+       roll    $1,%r14d
        andl    %edi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %edx,32(%rsp)
        addl    %ecx,%r13d
-       movl    36(%rsp),%ebp
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    44(%rsp),%ebp
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    36(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,32(%rsp)
+       movl    %r11d,%ebx
+       xorl    44(%rsp),%edx
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    4(%rsp),%edx
+       leal    -1894007588(%r14,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    24(%rsp),%ebp
        addl    %eax,%r12d
+       roll    $1,%edx
        andl    %esi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %ebp,36(%rsp)
        addl    %ecx,%r12d
-       movl    40(%rsp),%edx
-       movl    %esi,%eax
-       movl    %esi,%ebx
-       xorl    48(%rsp),%edx
-       andl    %edi,%eax
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    40(%rsp),%ebp
+       movl    %edi,%eax
+       movl    %edx,36(%rsp)
+       movl    %edi,%ebx
+       xorl    48(%rsp),%ebp
+       andl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %edi,%ebx
-       leal    -1894007588(%rbp,%r11,1),%r11d
+       xorl    8(%rsp),%ebp
+       leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    28(%rsp),%edx
        addl    %eax,%r11d
+       roll    $1,%ebp
        andl    %r13d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r11d
-       roll    $30,%r13d
-       movl    %edx,40(%rsp)
        addl    %ecx,%r11d
-       movl    44(%rsp),%ebp
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
-       xorl    52(%rsp),%ebp
-       andl    %esi,%eax
+       roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    44(%rsp),%r14d
+       movl    %esi,%eax
+       movl    %ebp,40(%rsp)
+       movl    %esi,%ebx
+       xorl    52(%rsp),%r14d
+       andl    %r13d,%eax
        movl    %r11d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %esi,%ebx
-       leal    -1894007588(%rdx,%rdi,1),%edi
+       xorl    12(%rsp),%r14d
+       leal    -1894007588(%rbp,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    32(%rsp),%ebp
        addl    %eax,%edi
+       roll    $1,%r14d
        andl    %r12d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%edi
-       roll    $30,%r12d
-       movl    %ebp,44(%rsp)
        addl    %ecx,%edi
-       movl    48(%rsp),%edx
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
+       roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    48(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,44(%rsp)
+       movl    %r13d,%ebx
        xorl    56(%rsp),%edx
-       andl    %r13d,%eax
+       andl    %r12d,%eax
        movl    %edi,%ecx
        xorl    16(%rsp),%edx
-       xorl    %r13d,%ebx
-       leal    -1894007588(%rbp,%rsi,1),%esi
+       leal    -1894007588(%r14,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    36(%rsp),%edx
        addl    %eax,%esi
-       andl    %r11d,%ebx
        roll    $1,%edx
-       addl    %ebx,%esi
+       andl    %r11d,%ebx
+       addl    %ecx,%esi
        roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    52(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,48(%rsp)
-       addl    %ecx,%esi
-       movl    52(%rsp),%ebp
-       movl    %r11d,%eax
        movl    %esi,%ecx
        xorl    60(%rsp),%ebp
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r13,1),%r13d
        xorl    20(%rsp),%ebp
-       xorl    %r12d,%eax
+       leal    -899497514(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    40(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%ebp
+       xorl    56(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,52(%rsp)
-       movl    56(%rsp),%edx
-       movl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    0(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    0(%rsp),%r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
+       xorl    24(%rsp),%r14d
        leal    -899497514(%rbp,%r12,1),%r12d
-       xorl    24(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    44(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,56(%rsp)
-       movl    60(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    60(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,56(%rsp)
        movl    %r12d,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r11,1),%r11d
-       xorl    28(%rsp),%ebp
+       xorl    4(%rsp),%edx
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    28(%rsp),%edx
+       leal    -899497514(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    48(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,60(%rsp)
-       movl    0(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%edx
+       xorl    0(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,60(%rsp)
        movl    %r11d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%rdi,1),%edi
-       xorl    32(%rsp),%edx
+       xorl    8(%rsp),%ebp
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    32(%rsp),%ebp
+       leal    -899497514(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    52(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,0(%rsp)
-       movl    4(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    4(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,0(%rsp)
        movl    %edi,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rsi,1),%esi
-       xorl    36(%rsp),%ebp
+       xorl    12(%rsp),%r14d
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    36(%rsp),%r14d
+       leal    -899497514(%rbp,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    56(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,4(%rsp)
-       movl    8(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%r14d
+       xorl    8(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,4(%rsp)
        movl    %esi,%ecx
        xorl    16(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r13,1),%r13d
        xorl    40(%rsp),%edx
-       xorl    %r12d,%eax
+       leal    -899497514(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    60(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%edx
+       xorl    12(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,8(%rsp)
-       movl    12(%rsp),%ebp
-       movl    %edi,%eax
        movl    %r13d,%ecx
        xorl    20(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r12,1),%r12d
        xorl    44(%rsp),%ebp
-       xorl    %r11d,%eax
+       leal    -899497514(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    0(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%ebp
+       xorl    16(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,12(%rsp)
-       movl    16(%rsp),%edx
-       movl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    24(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
+       xorl    48(%rsp),%r14d
        leal    -899497514(%rbp,%r11,1),%r11d
-       xorl    48(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    4(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,16(%rsp)
-       movl    20(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    20(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,16(%rsp)
        movl    %r11d,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rdi,1),%edi
-       xorl    52(%rsp),%ebp
+       xorl    28(%rsp),%edx
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    52(%rsp),%edx
+       leal    -899497514(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    8(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%edx
+       xorl    24(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,20(%rsp)
        movl    %edi,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%rsi,1),%esi
-       xorl    56(%rsp),%edx
+       xorl    32(%rsp),%ebp
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    56(%rsp),%ebp
+       leal    -899497514(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    12(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %r11d,%eax
+       roll    $1,%ebp
+       xorl    28(%rsp),%r14d
+       movl    %edi,%eax
+       movl    %ebp,24(%rsp)
        movl    %esi,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r13,1),%r13d
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%r14d
+       leal    -899497514(%rbp,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    16(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %edi,%eax
+       roll    $1,%r14d
+       xorl    32(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,28(%rsp)
        movl    %r13d,%ecx
        xorl    40(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r12,1),%r12d
        xorl    0(%rsp),%edx
-       xorl    %r11d,%eax
+       leal    -899497514(%r14,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    20(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%edx
-       movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %esi,%eax
+       xorl    36(%rsp),%ebp
+       movl    %r13d,%eax
+
        movl    %r12d,%ecx
        xorl    44(%rsp),%ebp
-       xorl    %r13d,%eax
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r11,1),%r11d
        xorl    4(%rsp),%ebp
-       xorl    %edi,%eax
+       leal    -899497514(%rdx,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    24(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
        roll    $1,%ebp
-       movl    %ebp,36(%rsp)
-       movl    40(%rsp),%edx
-       movl    %r13d,%eax
+       xorl    40(%rsp),%r14d
+       movl    %r12d,%eax
+
        movl    %r11d,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    48(%rsp),%r14d
+       xorl    %esi,%eax
        roll    $5,%ecx
+       xorl    8(%rsp),%r14d
        leal    -899497514(%rbp,%rdi,1),%edi
-       xorl    8(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    28(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,40(%rsp)
-       movl    44(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%r14d
+       xorl    44(%rsp),%edx
+       movl    %r11d,%eax
+
        movl    %edi,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rsi,1),%esi
-       xorl    12(%rsp),%ebp
+       xorl    52(%rsp),%edx
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    12(%rsp),%edx
+       leal    -899497514(%r14,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    32(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,44(%rsp)
-       movl    48(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %esi,%ecx
-       xorl    56(%rsp),%edx
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%r13,1),%r13d
-       xorl    16(%rsp),%edx
+       roll    $1,%edx
+       xorl    48(%rsp),%ebp
+       movl    %edi,%eax
+
+       movl    %esi,%ecx
+       xorl    56(%rsp),%ebp
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    16(%rsp),%ebp
+       leal    -899497514(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    36(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%edx
-       movl    %edx,48(%rsp)
-       movl    52(%rsp),%ebp
-       movl    %edi,%eax
+       roll    $1,%ebp
+       xorl    52(%rsp),%r14d
+       movl    %esi,%eax
+
        movl    %r13d,%ecx
-       xorl    60(%rsp),%ebp
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r12,1),%r12d
-       xorl    20(%rsp),%ebp
+       xorl    60(%rsp),%r14d
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    20(%rsp),%r14d
+       leal    -899497514(%rbp,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    40(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%ebp
-       movl    56(%rsp),%edx
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    56(%rsp),%edx
+       movl    %r13d,%eax
+
        movl    %r12d,%ecx
        xorl    0(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r11,1),%r11d
        xorl    24(%rsp),%edx
-       xorl    %edi,%eax
+       leal    -899497514(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    44(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
        roll    $1,%edx
-       movl    60(%rsp),%ebp
-       movl    %r13d,%eax
+       xorl    60(%rsp),%ebp
+       movl    %r12d,%eax
+
        movl    %r11d,%ecx
        xorl    4(%rsp),%ebp
-       xorl    %r12d,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%rdi,1),%edi
        xorl    28(%rsp),%ebp
-       xorl    %esi,%eax
+       leal    -899497514(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    48(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
        roll    $1,%ebp
-       movl    %r12d,%eax
+       movl    %r11d,%eax
        movl    %edi,%ecx
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        leal    -899497514(%rbp,%rsi,1),%esi
        roll    $5,%ecx
-       xorl    %r13d,%eax
+       xorl    %r12d,%eax
        addl    %ecx,%esi
        roll    $30,%r11d
        addl    %eax,%esi
@@ -1319,29 +1277,218 @@ sha1_block_data_order:
        jnz     .Lloop
 
        movq    64(%rsp),%rsi
-       movq    (%rsi),%r13
-       movq    8(%rsi),%r12
-       movq    16(%rsi),%rbp
-       movq    24(%rsi),%rbx
-       leaq    32(%rsi),%rsp
+.cfi_def_cfa   %rsi,8
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lepilogue:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  sha1_block_data_order,.-sha1_block_data_order
+.type  sha1_block_data_order_shaext,@function  # SHA-1 block routine built on the SHA-extension opcodes (emitted as raw .byte)
+.align 32
+sha1_block_data_order_shaext:          # in: %rdi = 20-byte state, %rsi = input, %rdx = number of 64-byte blocks
+_shaext_shortcut:                      # alias label for the same entry point
+.cfi_startproc                         # leaf routine: no pushes, %rsp untouched; scratch: %r8, %xmm0-%xmm9
+       movdqu  (%rdi),%xmm0            # xmm0 = first four state words
+       movd    16(%rdi),%xmm1          # xmm1 (low dword) = fifth state word (E)
+       movdqa  K_XX_XX+160(%rip),%xmm3 # pshufb mask; table is outside this view - presumably a byte-swap mask, confirm
+
+       movdqu  (%rsi),%xmm4            # load message bytes 0..15
+       pshufd  $27,%xmm0,%xmm0         # $27 = 0b00011011: reverse the dword order
+       movdqu  16(%rsi),%xmm5          # message bytes 16..31
+       pshufd  $27,%xmm1,%xmm1         # same reversal moves E into the top dword
+       movdqu  32(%rsi),%xmm6          # message bytes 32..47
+.byte  102,15,56,0,227                 # pshufb %xmm3,%xmm4
+       movdqu  48(%rsi),%xmm7          # message bytes 48..63
+.byte  102,15,56,0,235                 # pshufb %xmm3,%xmm5
+.byte  102,15,56,0,243                 # pshufb %xmm3,%xmm6
+       movdqa  %xmm1,%xmm9             # xmm9 = E on entry to the block (added back at rounds 76-79)
+.byte  102,15,56,0,251                 # pshufb %xmm3,%xmm7
+       jmp     .Loop_shaext
+
+.align 16
+.Loop_shaext:
+       decq    %rdx                    # sets ZF; nothing below writes EFLAGS, so cmovne AND the final jnz both use it
+       leaq    64(%rsi),%r8            # r8 = address of the next block (lea leaves flags alone)
+       paddd   %xmm4,%xmm1             # E += W[0..3]
+       cmovneq %r8,%rsi                # advance input only if blocks remain; on the last pass the reloads re-read this block
+       movdqa  %xmm0,%xmm8             # xmm8 = state on entry to the block (added back after round 79)
+.byte  15,56,201,229                   # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2             # keep pre-round state for the next sha1nexte
+.byte  15,58,204,193,0                 # sha1rnds4 $0,%xmm1,%xmm0   rounds 0-3
+.byte  15,56,200,213                   # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238                   # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231                   # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,0                 # sha1rnds4 $0,%xmm2,%xmm0   rounds 4-7
+.byte  15,56,200,206                   # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236                   # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247                   # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0                 # sha1rnds4 $0,%xmm1,%xmm0   rounds 8-11
+.byte  15,56,200,215                   # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252                   # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245                   # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,0                 # sha1rnds4 $0,%xmm2,%xmm0   rounds 12-15
+.byte  15,56,200,204                   # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254                   # sha1msg2 %xmm6,%xmm7
+.byte  15,56,201,229                   # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0                 # sha1rnds4 $0,%xmm1,%xmm0   rounds 16-19
+.byte  15,56,200,213                   # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238                   # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231                   # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1                 # sha1rnds4 $1,%xmm2,%xmm0   rounds 20-23
+.byte  15,56,200,206                   # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236                   # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247                   # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,1                 # sha1rnds4 $1,%xmm1,%xmm0   rounds 24-27
+.byte  15,56,200,215                   # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252                   # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245                   # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1                 # sha1rnds4 $1,%xmm2,%xmm0   rounds 28-31
+.byte  15,56,200,204                   # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254                   # sha1msg2 %xmm6,%xmm7
+.byte  15,56,201,229                   # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,1                 # sha1rnds4 $1,%xmm1,%xmm0   rounds 32-35
+.byte  15,56,200,213                   # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238                   # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231                   # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1                 # sha1rnds4 $1,%xmm2,%xmm0   rounds 36-39
+.byte  15,56,200,206                   # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236                   # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247                   # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2                 # sha1rnds4 $2,%xmm1,%xmm0   rounds 40-43
+.byte  15,56,200,215                   # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252                   # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245                   # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,2                 # sha1rnds4 $2,%xmm2,%xmm0   rounds 44-47
+.byte  15,56,200,204                   # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254                   # sha1msg2 %xmm6,%xmm7
+.byte  15,56,201,229                   # sha1msg1 %xmm5,%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2                 # sha1rnds4 $2,%xmm1,%xmm0   rounds 48-51
+.byte  15,56,200,213                   # sha1nexte %xmm5,%xmm2
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238                   # sha1msg1 %xmm6,%xmm5
+.byte  15,56,202,231                   # sha1msg2 %xmm7,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,2                 # sha1rnds4 $2,%xmm2,%xmm0   rounds 52-55
+.byte  15,56,200,206                   # sha1nexte %xmm6,%xmm1
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236                   # sha1msg2 %xmm4,%xmm5
+.byte  15,56,201,247                   # sha1msg1 %xmm7,%xmm6
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2                 # sha1rnds4 $2,%xmm1,%xmm0   rounds 56-59
+.byte  15,56,200,215                   # sha1nexte %xmm7,%xmm2
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252                   # sha1msg1 %xmm4,%xmm7
+.byte  15,56,202,245                   # sha1msg2 %xmm5,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3                 # sha1rnds4 $3,%xmm2,%xmm0   rounds 60-63
+.byte  15,56,200,204                   # sha1nexte %xmm4,%xmm1
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254                   # sha1msg2 %xmm6,%xmm7
+       movdqu  (%rsi),%xmm4            # start loading the next block (%rsi was already advanced, or left in place on the last pass)
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,3                 # sha1rnds4 $3,%xmm1,%xmm0   rounds 64-67
+.byte  15,56,200,213                   # sha1nexte %xmm5,%xmm2
+       movdqu  16(%rsi),%xmm5
+.byte  102,15,56,0,227                 # pshufb %xmm3,%xmm4
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3                 # sha1rnds4 $3,%xmm2,%xmm0   rounds 68-71
+.byte  15,56,200,206                   # sha1nexte %xmm6,%xmm1
+       movdqu  32(%rsi),%xmm6
+.byte  102,15,56,0,235                 # pshufb %xmm3,%xmm5
+
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,3                 # sha1rnds4 $3,%xmm1,%xmm0   rounds 72-75
+.byte  15,56,200,215                   # sha1nexte %xmm7,%xmm2
+       movdqu  48(%rsi),%xmm7
+.byte  102,15,56,0,243                 # pshufb %xmm3,%xmm6
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3                 # sha1rnds4 $3,%xmm2,%xmm0   rounds 76-79
+.byte  65,15,56,200,201                # sha1nexte %xmm9,%xmm1 (REX.B selects xmm9 = E saved at block entry)
+.byte  102,15,56,0,251                 # pshufb %xmm3,%xmm7
+
+       paddd   %xmm8,%xmm0             # add the state saved at block entry
+       movdqa  %xmm1,%xmm9             # new E becomes the saved E for the next block
+
+       jnz     .Loop_shaext            # ZF still comes from the decq at the top of the loop
+
+       pshufd  $27,%xmm0,%xmm0         # undo the dword reversal applied on entry
+       pshufd  $27,%xmm1,%xmm1         # bring E back to the low dword for movd
+       movdqu  %xmm0,(%rdi)            # write back the first four state words
+       movd    %xmm1,16(%rdi)          # write back E
+       .byte   0xf3,0xc3               # rep ret
+.cfi_endproc                           # closed after the return so the ret stays inside the CFI region
+.size  sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
 .type  sha1_block_data_order_ssse3,@function
 .align 16
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
+.cfi_startproc 
+       movq    %rsp,%r11
+.cfi_def_cfa_register  %r11
        pushq   %rbx
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
        leaq    -64(%rsp),%rsp
+       andq    $-64,%rsp
        movq    %rdi,%r8
        movq    %rsi,%r9
        movq    %rdx,%r10
 
        shlq    $6,%r10
        addq    %r9,%r10
-       leaq    K_XX_XX(%rip),%r11
+       leaq    K_XX_XX+64(%rip),%r14
 
        movl    0(%r8),%eax
        movl    4(%r8),%ebx
@@ -1353,18 +1500,18 @@ _ssse3_shortcut:
        xorl    %edx,%edi
        andl    %edi,%esi
 
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
+       movdqa  64(%r14),%xmm6
+       movdqa  -64(%r14),%xmm9
        movdqu  0(%r9),%xmm0
        movdqu  16(%r9),%xmm1
        movdqu  32(%r9),%xmm2
        movdqu  48(%r9),%xmm3
 .byte  102,15,56,0,198
-       addq    $64,%r9
 .byte  102,15,56,0,206
 .byte  102,15,56,0,214
-.byte  102,15,56,0,222
+       addq    $64,%r9
        paddd   %xmm9,%xmm0
+.byte  102,15,56,0,222
        paddd   %xmm9,%xmm1
        paddd   %xmm9,%xmm2
        movdqa  %xmm0,0(%rsp)
@@ -1376,24 +1523,24 @@ _ssse3_shortcut:
        jmp     .Loop_ssse3
 .align 16
 .Loop_ssse3:
-       movdqa  %xmm1,%xmm4
        rorl    $2,%ebx
+       pshufd  $238,%xmm0,%xmm4
        xorl    %edx,%esi
        movdqa  %xmm3,%xmm8
-.byte  102,15,58,15,224,8
+       paddd   %xmm3,%xmm9
        movl    %eax,%edi
        addl    0(%rsp),%ebp
-       paddd   %xmm3,%xmm9
+       punpcklqdq      %xmm1,%xmm4
        xorl    %ecx,%ebx
        roll    $5,%eax
-       psrldq  $4,%xmm8
        addl    %esi,%ebp
+       psrldq  $4,%xmm8
        andl    %ebx,%edi
-       pxor    %xmm0,%xmm4
        xorl    %ecx,%ebx
+       pxor    %xmm0,%xmm4
        addl    %eax,%ebp
-       pxor    %xmm2,%xmm8
        rorl    $7,%eax
+       pxor    %xmm2,%xmm8
        xorl    %ecx,%edi
        movl    %ebp,%esi
        addl    4(%rsp),%edx
@@ -1404,57 +1551,57 @@ _ssse3_shortcut:
        addl    %edi,%edx
        andl    %eax,%esi
        movdqa  %xmm4,%xmm10
-       movdqa  %xmm4,%xmm8
        xorl    %ebx,%eax
        addl    %ebp,%edx
        rorl    $7,%ebp
+       movdqa  %xmm4,%xmm8
        xorl    %ebx,%esi
        pslldq  $12,%xmm10
        paddd   %xmm4,%xmm4
        movl    %edx,%edi
        addl    8(%rsp),%ecx
+       psrld   $31,%xmm8
        xorl    %eax,%ebp
        roll    $5,%edx
-       psrld   $31,%xmm8
        addl    %esi,%ecx
-       andl    %ebp,%edi
        movdqa  %xmm10,%xmm9
+       andl    %ebp,%edi
        xorl    %eax,%ebp
-       addl    %edx,%ecx
        psrld   $30,%xmm10
-       por     %xmm8,%xmm4
+       addl    %edx,%ecx
        rorl    $7,%edx
+       por     %xmm8,%xmm4
        xorl    %eax,%edi
        movl    %ecx,%esi
        addl    12(%rsp),%ebx
        pslld   $2,%xmm9
        pxor    %xmm10,%xmm4
        xorl    %ebp,%edx
+       movdqa  -64(%r14),%xmm10
        roll    $5,%ecx
-       movdqa  0(%r11),%xmm10
        addl    %edi,%ebx
        andl    %edx,%esi
        pxor    %xmm9,%xmm4
        xorl    %ebp,%edx
        addl    %ecx,%ebx
-       movdqa  %xmm2,%xmm5
        rorl    $7,%ecx
+       pshufd  $238,%xmm1,%xmm5
        xorl    %ebp,%esi
        movdqa  %xmm4,%xmm9
-.byte  102,15,58,15,233,8
+       paddd   %xmm4,%xmm10
        movl    %ebx,%edi
        addl    16(%rsp),%eax
-       paddd   %xmm4,%xmm10
+       punpcklqdq      %xmm2,%xmm5
        xorl    %edx,%ecx
        roll    $5,%ebx
-       psrldq  $4,%xmm9
        addl    %esi,%eax
+       psrldq  $4,%xmm9
        andl    %ecx,%edi
-       pxor    %xmm1,%xmm5
        xorl    %edx,%ecx
+       pxor    %xmm1,%xmm5
        addl    %ebx,%eax
-       pxor    %xmm3,%xmm9
        rorl    $7,%ebx
+       pxor    %xmm3,%xmm9
        xorl    %edx,%edi
        movl    %eax,%esi
        addl    20(%rsp),%ebp
@@ -1465,57 +1612,57 @@ _ssse3_shortcut:
        addl    %edi,%ebp
        andl    %ebx,%esi
        movdqa  %xmm5,%xmm8
-       movdqa  %xmm5,%xmm9
        xorl    %ecx,%ebx
        addl    %eax,%ebp
        rorl    $7,%eax
+       movdqa  %xmm5,%xmm9
        xorl    %ecx,%esi
        pslldq  $12,%xmm8
        paddd   %xmm5,%xmm5
        movl    %ebp,%edi
        addl    24(%rsp),%edx
+       psrld   $31,%xmm9
        xorl    %ebx,%eax
        roll    $5,%ebp
-       psrld   $31,%xmm9
        addl    %esi,%edx
-       andl    %eax,%edi
        movdqa  %xmm8,%xmm10
+       andl    %eax,%edi
        xorl    %ebx,%eax
-       addl    %ebp,%edx
        psrld   $30,%xmm8
-       por     %xmm9,%xmm5
+       addl    %ebp,%edx
        rorl    $7,%ebp
+       por     %xmm9,%xmm5
        xorl    %ebx,%edi
        movl    %edx,%esi
        addl    28(%rsp),%ecx
        pslld   $2,%xmm10
        pxor    %xmm8,%xmm5
        xorl    %eax,%ebp
+       movdqa  -32(%r14),%xmm8
        roll    $5,%edx
-       movdqa  16(%r11),%xmm8
        addl    %edi,%ecx
        andl    %ebp,%esi
        pxor    %xmm10,%xmm5
        xorl    %eax,%ebp
        addl    %edx,%ecx
-       movdqa  %xmm3,%xmm6
        rorl    $7,%edx
+       pshufd  $238,%xmm2,%xmm6
        xorl    %eax,%esi
        movdqa  %xmm5,%xmm10
-.byte  102,15,58,15,242,8
+       paddd   %xmm5,%xmm8
        movl    %ecx,%edi
        addl    32(%rsp),%ebx
-       paddd   %xmm5,%xmm8
+       punpcklqdq      %xmm3,%xmm6
        xorl    %ebp,%edx
        roll    $5,%ecx
-       psrldq  $4,%xmm10
        addl    %esi,%ebx
+       psrldq  $4,%xmm10
        andl    %edx,%edi
-       pxor    %xmm2,%xmm6
        xorl    %ebp,%edx
+       pxor    %xmm2,%xmm6
        addl    %ecx,%ebx
-       pxor    %xmm4,%xmm10
        rorl    $7,%ecx
+       pxor    %xmm4,%xmm10
        xorl    %ebp,%edi
        movl    %ebx,%esi
        addl    36(%rsp),%eax
@@ -1526,57 +1673,57 @@ _ssse3_shortcut:
        addl    %edi,%eax
        andl    %ecx,%esi
        movdqa  %xmm6,%xmm9
-       movdqa  %xmm6,%xmm10
        xorl    %edx,%ecx
        addl    %ebx,%eax
        rorl    $7,%ebx
+       movdqa  %xmm6,%xmm10
        xorl    %edx,%esi
        pslldq  $12,%xmm9
        paddd   %xmm6,%xmm6
        movl    %eax,%edi
        addl    40(%rsp),%ebp
+       psrld   $31,%xmm10
        xorl    %ecx,%ebx
        roll    $5,%eax
-       psrld   $31,%xmm10
        addl    %esi,%ebp
-       andl    %ebx,%edi
        movdqa  %xmm9,%xmm8
+       andl    %ebx,%edi
        xorl    %ecx,%ebx
-       addl    %eax,%ebp
        psrld   $30,%xmm9
-       por     %xmm10,%xmm6
+       addl    %eax,%ebp
        rorl    $7,%eax
+       por     %xmm10,%xmm6
        xorl    %ecx,%edi
        movl    %ebp,%esi
        addl    44(%rsp),%edx
        pslld   $2,%xmm8
        pxor    %xmm9,%xmm6
        xorl    %ebx,%eax
+       movdqa  -32(%r14),%xmm9
        roll    $5,%ebp
-       movdqa  16(%r11),%xmm9
        addl    %edi,%edx
        andl    %eax,%esi
        pxor    %xmm8,%xmm6
        xorl    %ebx,%eax
        addl    %ebp,%edx
-       movdqa  %xmm4,%xmm7
        rorl    $7,%ebp
+       pshufd  $238,%xmm3,%xmm7
        xorl    %ebx,%esi
        movdqa  %xmm6,%xmm8
-.byte  102,15,58,15,251,8
+       paddd   %xmm6,%xmm9
        movl    %edx,%edi
        addl    48(%rsp),%ecx
-       paddd   %xmm6,%xmm9
+       punpcklqdq      %xmm4,%xmm7
        xorl    %eax,%ebp
        roll    $5,%edx
-       psrldq  $4,%xmm8
        addl    %esi,%ecx
+       psrldq  $4,%xmm8
        andl    %ebp,%edi
-       pxor    %xmm3,%xmm7
        xorl    %eax,%ebp
+       pxor    %xmm3,%xmm7
        addl    %edx,%ecx
-       pxor    %xmm5,%xmm8
        rorl    $7,%edx
+       pxor    %xmm5,%xmm8
        xorl    %eax,%edi
        movl    %ecx,%esi
        addl    52(%rsp),%ebx
@@ -1587,78 +1734,78 @@ _ssse3_shortcut:
        addl    %edi,%ebx
        andl    %edx,%esi
        movdqa  %xmm7,%xmm10
-       movdqa  %xmm7,%xmm8
        xorl    %ebp,%edx
        addl    %ecx,%ebx
        rorl    $7,%ecx
+       movdqa  %xmm7,%xmm8
        xorl    %ebp,%esi
        pslldq  $12,%xmm10
        paddd   %xmm7,%xmm7
        movl    %ebx,%edi
        addl    56(%rsp),%eax
+       psrld   $31,%xmm8
        xorl    %edx,%ecx
        roll    $5,%ebx
-       psrld   $31,%xmm8
        addl    %esi,%eax
-       andl    %ecx,%edi
        movdqa  %xmm10,%xmm9
+       andl    %ecx,%edi
        xorl    %edx,%ecx
-       addl    %ebx,%eax
        psrld   $30,%xmm10
-       por     %xmm8,%xmm7
+       addl    %ebx,%eax
        rorl    $7,%ebx
+       por     %xmm8,%xmm7
        xorl    %edx,%edi
        movl    %eax,%esi
        addl    60(%rsp),%ebp
        pslld   $2,%xmm9
        pxor    %xmm10,%xmm7
        xorl    %ecx,%ebx
+       movdqa  -32(%r14),%xmm10
        roll    $5,%eax
-       movdqa  16(%r11),%xmm10
        addl    %edi,%ebp
        andl    %ebx,%esi
        pxor    %xmm9,%xmm7
+       pshufd  $238,%xmm6,%xmm9
        xorl    %ecx,%ebx
        addl    %eax,%ebp
-       movdqa  %xmm7,%xmm9
        rorl    $7,%eax
        pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,206,8
        xorl    %ecx,%esi
        movl    %ebp,%edi
        addl    0(%rsp),%edx
-       pxor    %xmm1,%xmm0
+       punpcklqdq      %xmm7,%xmm9
        xorl    %ebx,%eax
        roll    $5,%ebp
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm7,%xmm10
+       pxor    %xmm1,%xmm0
        addl    %esi,%edx
        andl    %eax,%edi
-       pxor    %xmm9,%xmm0
+       movdqa  %xmm10,%xmm8
        xorl    %ebx,%eax
+       paddd   %xmm7,%xmm10
        addl    %ebp,%edx
+       pxor    %xmm9,%xmm0
        rorl    $7,%ebp
        xorl    %ebx,%edi
-       movdqa  %xmm0,%xmm9
-       movdqa  %xmm10,48(%rsp)
        movl    %edx,%esi
        addl    4(%rsp),%ecx
+       movdqa  %xmm0,%xmm9
        xorl    %eax,%ebp
        roll    $5,%edx
-       pslld   $2,%xmm0
+       movdqa  %xmm10,48(%rsp)
        addl    %edi,%ecx
        andl    %ebp,%esi
-       psrld   $30,%xmm9
        xorl    %eax,%ebp
+       pslld   $2,%xmm0
        addl    %edx,%ecx
        rorl    $7,%edx
+       psrld   $30,%xmm9
        xorl    %eax,%esi
        movl    %ecx,%edi
        addl    8(%rsp),%ebx
        por     %xmm9,%xmm0
        xorl    %ebp,%edx
        roll    $5,%ecx
-       movdqa  %xmm0,%xmm10
+       pshufd  $238,%xmm7,%xmm10
        addl    %esi,%ebx
        andl    %edx,%edi
        xorl    %ebp,%edx
@@ -1671,18 +1818,18 @@ _ssse3_shortcut:
        xorl    %edx,%esi
        rorl    $7,%ecx
        addl    %ebx,%eax
-       addl    16(%rsp),%ebp
        pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,215,8
+       addl    16(%rsp),%ebp
        xorl    %ecx,%esi
+       punpcklqdq      %xmm0,%xmm10
        movl    %eax,%edi
        roll    $5,%eax
        pxor    %xmm2,%xmm1
        addl    %esi,%ebp
        xorl    %ecx,%edi
        movdqa  %xmm8,%xmm9
-       paddd   %xmm0,%xmm8
        rorl    $7,%ebx
+       paddd   %xmm0,%xmm8
        addl    %eax,%ebp
        pxor    %xmm10,%xmm1
        addl    20(%rsp),%edx
@@ -1690,43 +1837,43 @@ _ssse3_shortcut:
        movl    %ebp,%esi
        roll    $5,%ebp
        movdqa  %xmm1,%xmm10
-       movdqa  %xmm8,0(%rsp)
        addl    %edi,%edx
        xorl    %ebx,%esi
+       movdqa  %xmm8,0(%rsp)
        rorl    $7,%eax
        addl    %ebp,%edx
-       pslld   $2,%xmm1
        addl    24(%rsp),%ecx
+       pslld   $2,%xmm1
        xorl    %eax,%esi
-       psrld   $30,%xmm10
        movl    %edx,%edi
+       psrld   $30,%xmm10
        roll    $5,%edx
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
-       addl    %edx,%ecx
        por     %xmm10,%xmm1
+       addl    %edx,%ecx
        addl    28(%rsp),%ebx
+       pshufd  $238,%xmm0,%xmm8
        xorl    %ebp,%edi
-       movdqa  %xmm1,%xmm8
        movl    %ecx,%esi
        roll    $5,%ecx
        addl    %edi,%ebx
        xorl    %ebp,%esi
        rorl    $7,%edx
        addl    %ecx,%ebx
-       addl    32(%rsp),%eax
        pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,192,8
+       addl    32(%rsp),%eax
        xorl    %edx,%esi
+       punpcklqdq      %xmm1,%xmm8
        movl    %ebx,%edi
        roll    $5,%ebx
        pxor    %xmm3,%xmm2
        addl    %esi,%eax
        xorl    %edx,%edi
-       movdqa  32(%r11),%xmm10
-       paddd   %xmm1,%xmm9
+       movdqa  0(%r14),%xmm10
        rorl    $7,%ecx
+       paddd   %xmm1,%xmm9
        addl    %ebx,%eax
        pxor    %xmm8,%xmm2
        addl    36(%rsp),%ebp
@@ -1734,43 +1881,43 @@ _ssse3_shortcut:
        movl    %eax,%esi
        roll    $5,%eax
        movdqa  %xmm2,%xmm8
-       movdqa  %xmm9,16(%rsp)
        addl    %edi,%ebp
        xorl    %ecx,%esi
+       movdqa  %xmm9,16(%rsp)
        rorl    $7,%ebx
        addl    %eax,%ebp
-       pslld   $2,%xmm2
        addl    40(%rsp),%edx
+       pslld   $2,%xmm2
        xorl    %ebx,%esi
-       psrld   $30,%xmm8
        movl    %ebp,%edi
+       psrld   $30,%xmm8
        roll    $5,%ebp
        addl    %esi,%edx
        xorl    %ebx,%edi
        rorl    $7,%eax
-       addl    %ebp,%edx
        por     %xmm8,%xmm2
+       addl    %ebp,%edx
        addl    44(%rsp),%ecx
+       pshufd  $238,%xmm1,%xmm9
        xorl    %eax,%edi
-       movdqa  %xmm2,%xmm9
        movl    %edx,%esi
        roll    $5,%edx
        addl    %edi,%ecx
        xorl    %eax,%esi
        rorl    $7,%ebp
        addl    %edx,%ecx
-       addl    48(%rsp),%ebx
        pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,201,8
+       addl    48(%rsp),%ebx
        xorl    %ebp,%esi
+       punpcklqdq      %xmm2,%xmm9
        movl    %ecx,%edi
        roll    $5,%ecx
        pxor    %xmm4,%xmm3
        addl    %esi,%ebx
        xorl    %ebp,%edi
        movdqa  %xmm10,%xmm8
-       paddd   %xmm2,%xmm10
        rorl    $7,%edx
+       paddd   %xmm2,%xmm10
        addl    %ecx,%ebx
        pxor    %xmm9,%xmm3
        addl    52(%rsp),%eax
@@ -1778,43 +1925,43 @@ _ssse3_shortcut:
        movl    %ebx,%esi
        roll    $5,%ebx
        movdqa  %xmm3,%xmm9
-       movdqa  %xmm10,32(%rsp)
        addl    %edi,%eax
        xorl    %edx,%esi
+       movdqa  %xmm10,32(%rsp)
        rorl    $7,%ecx
        addl    %ebx,%eax
-       pslld   $2,%xmm3
        addl    56(%rsp),%ebp
+       pslld   $2,%xmm3
        xorl    %ecx,%esi
-       psrld   $30,%xmm9
        movl    %eax,%edi
+       psrld   $30,%xmm9
        roll    $5,%eax
        addl    %esi,%ebp
        xorl    %ecx,%edi
        rorl    $7,%ebx
-       addl    %eax,%ebp
        por     %xmm9,%xmm3
+       addl    %eax,%ebp
        addl    60(%rsp),%edx
+       pshufd  $238,%xmm2,%xmm10
        xorl    %ebx,%edi
-       movdqa  %xmm3,%xmm10
        movl    %ebp,%esi
        roll    $5,%ebp
        addl    %edi,%edx
        xorl    %ebx,%esi
        rorl    $7,%eax
        addl    %ebp,%edx
-       addl    0(%rsp),%ecx
        pxor    %xmm0,%xmm4
-.byte  102,68,15,58,15,210,8
+       addl    0(%rsp),%ecx
        xorl    %eax,%esi
+       punpcklqdq      %xmm3,%xmm10
        movl    %edx,%edi
        roll    $5,%edx
        pxor    %xmm5,%xmm4
        addl    %esi,%ecx
        xorl    %eax,%edi
        movdqa  %xmm8,%xmm9
-       paddd   %xmm3,%xmm8
        rorl    $7,%ebp
+       paddd   %xmm3,%xmm8
        addl    %edx,%ecx
        pxor    %xmm10,%xmm4
        addl    4(%rsp),%ebx
@@ -1822,43 +1969,43 @@ _ssse3_shortcut:
        movl    %ecx,%esi
        roll    $5,%ecx
        movdqa  %xmm4,%xmm10
-       movdqa  %xmm8,48(%rsp)
        addl    %edi,%ebx
        xorl    %ebp,%esi
+       movdqa  %xmm8,48(%rsp)
        rorl    $7,%edx
        addl    %ecx,%ebx
-       pslld   $2,%xmm4
        addl    8(%rsp),%eax
+       pslld   $2,%xmm4
        xorl    %edx,%esi
-       psrld   $30,%xmm10
        movl    %ebx,%edi
+       psrld   $30,%xmm10
        roll    $5,%ebx
        addl    %esi,%eax
        xorl    %edx,%edi
        rorl    $7,%ecx
-       addl    %ebx,%eax
        por     %xmm10,%xmm4
+       addl    %ebx,%eax
        addl    12(%rsp),%ebp
+       pshufd  $238,%xmm3,%xmm8
        xorl    %ecx,%edi
-       movdqa  %xmm4,%xmm8
        movl    %eax,%esi
        roll    $5,%eax
        addl    %edi,%ebp
        xorl    %ecx,%esi
        rorl    $7,%ebx
        addl    %eax,%ebp
-       addl    16(%rsp),%edx
        pxor    %xmm1,%xmm5
-.byte  102,68,15,58,15,195,8
+       addl    16(%rsp),%edx
        xorl    %ebx,%esi
+       punpcklqdq      %xmm4,%xmm8
        movl    %ebp,%edi
        roll    $5,%ebp
        pxor    %xmm6,%xmm5
        addl    %esi,%edx
        xorl    %ebx,%edi
        movdqa  %xmm9,%xmm10
-       paddd   %xmm4,%xmm9
        rorl    $7,%eax
+       paddd   %xmm4,%xmm9
        addl    %ebp,%edx
        pxor    %xmm8,%xmm5
        addl    20(%rsp),%ecx
@@ -1866,24 +2013,24 @@ _ssse3_shortcut:
        movl    %edx,%esi
        roll    $5,%edx
        movdqa  %xmm5,%xmm8
-       movdqa  %xmm9,0(%rsp)
        addl    %edi,%ecx
        xorl    %eax,%esi
+       movdqa  %xmm9,0(%rsp)
        rorl    $7,%ebp
        addl    %edx,%ecx
-       pslld   $2,%xmm5
        addl    24(%rsp),%ebx
+       pslld   $2,%xmm5
        xorl    %ebp,%esi
-       psrld   $30,%xmm8
        movl    %ecx,%edi
+       psrld   $30,%xmm8
        roll    $5,%ecx
        addl    %esi,%ebx
        xorl    %ebp,%edi
        rorl    $7,%edx
-       addl    %ecx,%ebx
        por     %xmm8,%xmm5
+       addl    %ecx,%ebx
        addl    28(%rsp),%eax
-       movdqa  %xmm5,%xmm9
+       pshufd  $238,%xmm4,%xmm9
        rorl    $7,%ecx
        movl    %ebx,%esi
        xorl    %edx,%edi
@@ -1892,47 +2039,47 @@ _ssse3_shortcut:
        xorl    %ecx,%esi
        xorl    %edx,%ecx
        addl    %ebx,%eax
-       addl    32(%rsp),%ebp
        pxor    %xmm2,%xmm6
-.byte  102,68,15,58,15,204,8
+       addl    32(%rsp),%ebp
        andl    %ecx,%esi
        xorl    %edx,%ecx
        rorl    $7,%ebx
-       pxor    %xmm7,%xmm6
+       punpcklqdq      %xmm5,%xmm9
        movl    %eax,%edi
        xorl    %ecx,%esi
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm5,%xmm10
+       pxor    %xmm7,%xmm6
        roll    $5,%eax
        addl    %esi,%ebp
-       pxor    %xmm9,%xmm6
+       movdqa  %xmm10,%xmm8
        xorl    %ebx,%edi
+       paddd   %xmm5,%xmm10
        xorl    %ecx,%ebx
+       pxor    %xmm9,%xmm6
        addl    %eax,%ebp
        addl    36(%rsp),%edx
-       movdqa  %xmm6,%xmm9
-       movdqa  %xmm10,16(%rsp)
        andl    %ebx,%edi
        xorl    %ecx,%ebx
        rorl    $7,%eax
+       movdqa  %xmm6,%xmm9
        movl    %ebp,%esi
-       pslld   $2,%xmm6
        xorl    %ebx,%edi
+       movdqa  %xmm10,16(%rsp)
        roll    $5,%ebp
-       psrld   $30,%xmm9
        addl    %edi,%edx
        xorl    %eax,%esi
+       pslld   $2,%xmm6
        xorl    %ebx,%eax
        addl    %ebp,%edx
+       psrld   $30,%xmm9
        addl    40(%rsp),%ecx
        andl    %eax,%esi
-       por     %xmm9,%xmm6
        xorl    %ebx,%eax
+       por     %xmm9,%xmm6
        rorl    $7,%ebp
-       movdqa  %xmm6,%xmm10
        movl    %edx,%edi
        xorl    %eax,%esi
        roll    $5,%edx
+       pshufd  $238,%xmm5,%xmm10
        addl    %esi,%ecx
        xorl    %ebp,%edi
        xorl    %eax,%ebp
@@ -1948,47 +2095,47 @@ _ssse3_shortcut:
        xorl    %edx,%esi
        xorl    %ebp,%edx
        addl    %ecx,%ebx
-       addl    48(%rsp),%eax
        pxor    %xmm3,%xmm7
-.byte  102,68,15,58,15,213,8
+       addl    48(%rsp),%eax
        andl    %edx,%esi
        xorl    %ebp,%edx
        rorl    $7,%ecx
-       pxor    %xmm0,%xmm7
+       punpcklqdq      %xmm6,%xmm10
        movl    %ebx,%edi
        xorl    %edx,%esi
-       movdqa  48(%r11),%xmm9
-       paddd   %xmm6,%xmm8
+       pxor    %xmm0,%xmm7
        roll    $5,%ebx
        addl    %esi,%eax
-       pxor    %xmm10,%xmm7
+       movdqa  32(%r14),%xmm9
        xorl    %ecx,%edi
+       paddd   %xmm6,%xmm8
        xorl    %edx,%ecx
+       pxor    %xmm10,%xmm7
        addl    %ebx,%eax
        addl    52(%rsp),%ebp
-       movdqa  %xmm7,%xmm10
-       movdqa  %xmm8,32(%rsp)
        andl    %ecx,%edi
        xorl    %edx,%ecx
        rorl    $7,%ebx
+       movdqa  %xmm7,%xmm10
        movl    %eax,%esi
-       pslld   $2,%xmm7
        xorl    %ecx,%edi
+       movdqa  %xmm8,32(%rsp)
        roll    $5,%eax
-       psrld   $30,%xmm10
        addl    %edi,%ebp
        xorl    %ebx,%esi
+       pslld   $2,%xmm7
        xorl    %ecx,%ebx
        addl    %eax,%ebp
+       psrld   $30,%xmm10
        addl    56(%rsp),%edx
        andl    %ebx,%esi
-       por     %xmm10,%xmm7
        xorl    %ecx,%ebx
+       por     %xmm10,%xmm7
        rorl    $7,%eax
-       movdqa  %xmm7,%xmm8
        movl    %ebp,%edi
        xorl    %ebx,%esi
        roll    $5,%ebp
+       pshufd  $238,%xmm6,%xmm8
        addl    %esi,%edx
        xorl    %eax,%edi
        xorl    %ebx,%eax
@@ -2004,47 +2151,47 @@ _ssse3_shortcut:
        xorl    %ebp,%esi
        xorl    %eax,%ebp
        addl    %edx,%ecx
-       addl    0(%rsp),%ebx
        pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,198,8
+       addl    0(%rsp),%ebx
        andl    %ebp,%esi
        xorl    %eax,%ebp
        rorl    $7,%edx
-       pxor    %xmm1,%xmm0
+       punpcklqdq      %xmm7,%xmm8
        movl    %ecx,%edi
        xorl    %ebp,%esi
-       movdqa  %xmm9,%xmm10
-       paddd   %xmm7,%xmm9
+       pxor    %xmm1,%xmm0
        roll    $5,%ecx
        addl    %esi,%ebx
-       pxor    %xmm8,%xmm0
+       movdqa  %xmm9,%xmm10
        xorl    %edx,%edi
+       paddd   %xmm7,%xmm9
        xorl    %ebp,%edx
+       pxor    %xmm8,%xmm0
        addl    %ecx,%ebx
        addl    4(%rsp),%eax
-       movdqa  %xmm0,%xmm8
-       movdqa  %xmm9,48(%rsp)
        andl    %edx,%edi
        xorl    %ebp,%edx
        rorl    $7,%ecx
+       movdqa  %xmm0,%xmm8
        movl    %ebx,%esi
-       pslld   $2,%xmm0
        xorl    %edx,%edi
+       movdqa  %xmm9,48(%rsp)
        roll    $5,%ebx
-       psrld   $30,%xmm8
        addl    %edi,%eax
        xorl    %ecx,%esi
+       pslld   $2,%xmm0
        xorl    %edx,%ecx
        addl    %ebx,%eax
+       psrld   $30,%xmm8
        addl    8(%rsp),%ebp
        andl    %ecx,%esi
-       por     %xmm8,%xmm0
        xorl    %edx,%ecx
+       por     %xmm8,%xmm0
        rorl    $7,%ebx
-       movdqa  %xmm0,%xmm9
        movl    %eax,%edi
        xorl    %ecx,%esi
        roll    $5,%eax
+       pshufd  $238,%xmm7,%xmm9
        addl    %esi,%ebp
        xorl    %ebx,%edi
        xorl    %ecx,%ebx
@@ -2060,47 +2207,47 @@ _ssse3_shortcut:
        xorl    %eax,%esi
        xorl    %ebx,%eax
        addl    %ebp,%edx
-       addl    16(%rsp),%ecx
        pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,207,8
+       addl    16(%rsp),%ecx
        andl    %eax,%esi
        xorl    %ebx,%eax
        rorl    $7,%ebp
-       pxor    %xmm2,%xmm1
+       punpcklqdq      %xmm0,%xmm9
        movl    %edx,%edi
        xorl    %eax,%esi
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm0,%xmm10
+       pxor    %xmm2,%xmm1
        roll    $5,%edx
        addl    %esi,%ecx
-       pxor    %xmm9,%xmm1
+       movdqa  %xmm10,%xmm8
        xorl    %ebp,%edi
+       paddd   %xmm0,%xmm10
        xorl    %eax,%ebp
+       pxor    %xmm9,%xmm1
        addl    %edx,%ecx
        addl    20(%rsp),%ebx
-       movdqa  %xmm1,%xmm9
-       movdqa  %xmm10,0(%rsp)
        andl    %ebp,%edi
        xorl    %eax,%ebp
        rorl    $7,%edx
+       movdqa  %xmm1,%xmm9
        movl    %ecx,%esi
-       pslld   $2,%xmm1
        xorl    %ebp,%edi
+       movdqa  %xmm10,0(%rsp)
        roll    $5,%ecx
-       psrld   $30,%xmm9
        addl    %edi,%ebx
        xorl    %edx,%esi
+       pslld   $2,%xmm1
        xorl    %ebp,%edx
        addl    %ecx,%ebx
+       psrld   $30,%xmm9
        addl    24(%rsp),%eax
        andl    %edx,%esi
-       por     %xmm9,%xmm1
        xorl    %ebp,%edx
+       por     %xmm9,%xmm1
        rorl    $7,%ecx
-       movdqa  %xmm1,%xmm10
        movl    %ebx,%edi
        xorl    %edx,%esi
        roll    $5,%ebx
+       pshufd  $238,%xmm0,%xmm10
        addl    %esi,%eax
        xorl    %ecx,%edi
        xorl    %edx,%ecx
@@ -2116,47 +2263,47 @@ _ssse3_shortcut:
        xorl    %ebx,%esi
        xorl    %ecx,%ebx
        addl    %eax,%ebp
-       addl    32(%rsp),%edx
        pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,208,8
+       addl    32(%rsp),%edx
        andl    %ebx,%esi
        xorl    %ecx,%ebx
        rorl    $7,%eax
-       pxor    %xmm3,%xmm2
+       punpcklqdq      %xmm1,%xmm10
        movl    %ebp,%edi
        xorl    %ebx,%esi
-       movdqa  %xmm8,%xmm9
-       paddd   %xmm1,%xmm8
+       pxor    %xmm3,%xmm2
        roll    $5,%ebp
        addl    %esi,%edx
-       pxor    %xmm10,%xmm2
+       movdqa  %xmm8,%xmm9
        xorl    %eax,%edi
+       paddd   %xmm1,%xmm8
        xorl    %ebx,%eax
+       pxor    %xmm10,%xmm2
        addl    %ebp,%edx
        addl    36(%rsp),%ecx
-       movdqa  %xmm2,%xmm10
-       movdqa  %xmm8,16(%rsp)
        andl    %eax,%edi
        xorl    %ebx,%eax
        rorl    $7,%ebp
+       movdqa  %xmm2,%xmm10
        movl    %edx,%esi
-       pslld   $2,%xmm2
        xorl    %eax,%edi
+       movdqa  %xmm8,16(%rsp)
        roll    $5,%edx
-       psrld   $30,%xmm10
        addl    %edi,%ecx
        xorl    %ebp,%esi
+       pslld   $2,%xmm2
        xorl    %eax,%ebp
        addl    %edx,%ecx
+       psrld   $30,%xmm10
        addl    40(%rsp),%ebx
        andl    %ebp,%esi
-       por     %xmm10,%xmm2
        xorl    %eax,%ebp
+       por     %xmm10,%xmm2
        rorl    $7,%edx
-       movdqa  %xmm2,%xmm8
        movl    %ecx,%edi
        xorl    %ebp,%esi
        roll    $5,%ecx
+       pshufd  $238,%xmm1,%xmm8
        addl    %esi,%ebx
        xorl    %edx,%edi
        xorl    %ebp,%edx
@@ -2171,18 +2318,18 @@ _ssse3_shortcut:
        addl    %edi,%eax
        xorl    %edx,%esi
        addl    %ebx,%eax
-       addl    48(%rsp),%ebp
        pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,193,8
+       addl    48(%rsp),%ebp
        xorl    %ecx,%esi
+       punpcklqdq      %xmm2,%xmm8
        movl    %eax,%edi
        roll    $5,%eax
        pxor    %xmm4,%xmm3
        addl    %esi,%ebp
        xorl    %ecx,%edi
        movdqa  %xmm9,%xmm10
-       paddd   %xmm2,%xmm9
        rorl    $7,%ebx
+       paddd   %xmm2,%xmm9
        addl    %eax,%ebp
        pxor    %xmm8,%xmm3
        addl    52(%rsp),%edx
@@ -2190,22 +2337,22 @@ _ssse3_shortcut:
        movl    %ebp,%esi
        roll    $5,%ebp
        movdqa  %xmm3,%xmm8
-       movdqa  %xmm9,32(%rsp)
        addl    %edi,%edx
        xorl    %ebx,%esi
+       movdqa  %xmm9,32(%rsp)
        rorl    $7,%eax
        addl    %ebp,%edx
-       pslld   $2,%xmm3
        addl    56(%rsp),%ecx
+       pslld   $2,%xmm3
        xorl    %eax,%esi
-       psrld   $30,%xmm8
        movl    %edx,%edi
+       psrld   $30,%xmm8
        roll    $5,%edx
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
-       addl    %edx,%ecx
        por     %xmm8,%xmm3
+       addl    %edx,%ecx
        addl    60(%rsp),%ebx
        xorl    %ebp,%edi
        movl    %ecx,%esi
@@ -2215,13 +2362,13 @@ _ssse3_shortcut:
        rorl    $7,%edx
        addl    %ecx,%ebx
        addl    0(%rsp),%eax
-       paddd   %xmm3,%xmm10
        xorl    %edx,%esi
        movl    %ebx,%edi
        roll    $5,%ebx
+       paddd   %xmm3,%xmm10
        addl    %esi,%eax
-       movdqa  %xmm10,48(%rsp)
        xorl    %edx,%edi
+       movdqa  %xmm10,48(%rsp)
        rorl    $7,%ecx
        addl    %ebx,%eax
        addl    4(%rsp),%ebp
@@ -2250,8 +2397,8 @@ _ssse3_shortcut:
        addl    %edx,%ecx
        cmpq    %r10,%r9
        je      .Ldone_ssse3
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
+       movdqa  64(%r14),%xmm6
+       movdqa  -64(%r14),%xmm9
        movdqu  0(%r9),%xmm0
        movdqu  16(%r9),%xmm1
        movdqu  32(%r9),%xmm2
@@ -2260,23 +2407,23 @@ _ssse3_shortcut:
        addq    $64,%r9
        addl    16(%rsp),%ebx
        xorl    %ebp,%esi
-.byte  102,15,56,0,206
        movl    %ecx,%edi
+.byte  102,15,56,0,206
        roll    $5,%ecx
-       paddd   %xmm9,%xmm0
        addl    %esi,%ebx
        xorl    %ebp,%edi
        rorl    $7,%edx
+       paddd   %xmm9,%xmm0
        addl    %ecx,%ebx
-       movdqa  %xmm0,0(%rsp)
        addl    20(%rsp),%eax
        xorl    %edx,%edi
-       psubd   %xmm9,%xmm0
        movl    %ebx,%esi
+       movdqa  %xmm0,0(%rsp)
        roll    $5,%ebx
        addl    %edi,%eax
        xorl    %edx,%esi
        rorl    $7,%ecx
+       psubd   %xmm9,%xmm0
        addl    %ebx,%eax
        addl    24(%rsp),%ebp
        xorl    %ecx,%esi
@@ -2296,23 +2443,23 @@ _ssse3_shortcut:
        addl    %ebp,%edx
        addl    32(%rsp),%ecx
        xorl    %eax,%esi
-.byte  102,15,56,0,214
        movl    %edx,%edi
+.byte  102,15,56,0,214
        roll    $5,%edx
-       paddd   %xmm9,%xmm1
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
+       paddd   %xmm9,%xmm1
        addl    %edx,%ecx
-       movdqa  %xmm1,16(%rsp)
        addl    36(%rsp),%ebx
        xorl    %ebp,%edi
-       psubd   %xmm9,%xmm1
        movl    %ecx,%esi
+       movdqa  %xmm1,16(%rsp)
        roll    $5,%ecx
        addl    %edi,%ebx
        xorl    %ebp,%esi
        rorl    $7,%edx
+       psubd   %xmm9,%xmm1
        addl    %ecx,%ebx
        addl    40(%rsp),%eax
        xorl    %edx,%esi
@@ -2332,23 +2479,23 @@ _ssse3_shortcut:
        addl    %eax,%ebp
        addl    48(%rsp),%edx
        xorl    %ebx,%esi
-.byte  102,15,56,0,222
        movl    %ebp,%edi
+.byte  102,15,56,0,222
        roll    $5,%ebp
-       paddd   %xmm9,%xmm2
        addl    %esi,%edx
        xorl    %ebx,%edi
        rorl    $7,%eax
+       paddd   %xmm9,%xmm2
        addl    %ebp,%edx
-       movdqa  %xmm2,32(%rsp)
        addl    52(%rsp),%ecx
        xorl    %eax,%edi
-       psubd   %xmm9,%xmm2
        movl    %edx,%esi
+       movdqa  %xmm2,32(%rsp)
        roll    $5,%edx
        addl    %edi,%ecx
        xorl    %eax,%esi
        rorl    $7,%ebp
+       psubd   %xmm9,%xmm2
        addl    %edx,%ecx
        addl    56(%rsp),%ebx
        xorl    %ebp,%esi
@@ -2488,25 +2635,2857 @@ _ssse3_shortcut:
        movl    %ecx,8(%r8)
        movl    %edx,12(%r8)
        movl    %ebp,16(%r8)
-       leaq    64(%rsp),%rsi
-       movq    0(%rsi),%r12
-       movq    8(%rsi),%rbp
-       movq    16(%rsi),%rbx
-       leaq    24(%rsi),%rsp
+       movq    -40(%r11),%r14
+.cfi_restore   %r14
+       movq    -32(%r11),%r13
+.cfi_restore   %r13
+       movq    -24(%r11),%r12
+.cfi_restore   %r12
+       movq    -16(%r11),%rbp
+.cfi_restore   %rbp
+       movq    -8(%r11),%rbx
+.cfi_restore   %rbx
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
 .Lepilogue_ssse3:
        .byte   0xf3,0xc3
+.cfi_endproc   
 .size  sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
-.align 64
-K_XX_XX:
-.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     
-.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     
-.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     
-.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     
-.byte  83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 64
-
+.type  sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc 
+       movq    %rsp,%r11
+.cfi_def_cfa_register  %r11
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       leaq    -64(%rsp),%rsp
+       vzeroupper
+       andq    $-64,%rsp
+       movq    %rdi,%r8
+       movq    %rsi,%r9
+       movq    %rdx,%r10
 
-.section .note.GNU-stack,"",%progbits
+       shlq    $6,%r10
+       addq    %r9,%r10
+       leaq    K_XX_XX+64(%rip),%r14
 
+       movl    0(%r8),%eax
+       movl    4(%r8),%ebx
+       movl    8(%r8),%ecx
+       movl    12(%r8),%edx
+       movl    %ebx,%esi
+       movl    16(%r8),%ebp
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       andl    %edi,%esi
 
+       vmovdqa 64(%r14),%xmm6
+       vmovdqa -64(%r14),%xmm11
+       vmovdqu 0(%r9),%xmm0
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0
+       addq    $64,%r9
+       vpshufb %xmm6,%xmm1,%xmm1
+       vpshufb %xmm6,%xmm2,%xmm2
+       vpshufb %xmm6,%xmm3,%xmm3
+       vpaddd  %xmm11,%xmm0,%xmm4
+       vpaddd  %xmm11,%xmm1,%xmm5
+       vpaddd  %xmm11,%xmm2,%xmm6
+       vmovdqa %xmm4,0(%rsp)
+       vmovdqa %xmm5,16(%rsp)
+       vmovdqa %xmm6,32(%rsp)
+       jmp     .Loop_avx
+.align 16
+.Loop_avx:
+       shrdl   $2,%ebx,%ebx
+       xorl    %edx,%esi
+       vpalignr        $8,%xmm0,%xmm1,%xmm4
+       movl    %eax,%edi
+       addl    0(%rsp),%ebp
+       vpaddd  %xmm3,%xmm11,%xmm9
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrldq $4,%xmm3,%xmm8
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       vpxor   %xmm0,%xmm4,%xmm4
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm2,%xmm8,%xmm8
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    4(%rsp),%edx
+       vpxor   %xmm8,%xmm4,%xmm4
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%edx
+       andl    %eax,%esi
+       vpsrld  $31,%xmm4,%xmm8
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpslldq $12,%xmm4,%xmm10
+       vpaddd  %xmm4,%xmm4,%xmm4
+       movl    %edx,%edi
+       addl    8(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm4,%xmm4
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    12(%rsp),%ebx
+       vpxor   %xmm10,%xmm4,%xmm4
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpalignr        $8,%xmm1,%xmm2,%xmm5
+       movl    %ebx,%edi
+       addl    16(%rsp),%eax
+       vpaddd  %xmm4,%xmm11,%xmm9
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrldq $4,%xmm4,%xmm8
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       vpxor   %xmm1,%xmm5,%xmm5
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm3,%xmm8,%xmm8
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    20(%rsp),%ebp
+       vpxor   %xmm8,%xmm5,%xmm5
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       vpsrld  $31,%xmm5,%xmm8
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       vpslldq $12,%xmm5,%xmm10
+       vpaddd  %xmm5,%xmm5,%xmm5
+       movl    %ebp,%edi
+       addl    24(%rsp),%edx
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    %esi,%edx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm5,%xmm5
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       movl    %edx,%esi
+       addl    28(%rsp),%ecx
+       vpxor   %xmm10,%xmm5,%xmm5
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vmovdqa -32(%r14),%xmm11
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       vpalignr        $8,%xmm2,%xmm3,%xmm6
+       movl    %ecx,%edi
+       addl    32(%rsp),%ebx
+       vpaddd  %xmm5,%xmm11,%xmm9
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vpsrldq $4,%xmm5,%xmm8
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       vpxor   %xmm2,%xmm6,%xmm6
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm4,%xmm8,%xmm8
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       addl    36(%rsp),%eax
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%eax
+       andl    %ecx,%esi
+       vpsrld  $31,%xmm6,%xmm8
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%esi
+       vpslldq $12,%xmm6,%xmm10
+       vpaddd  %xmm6,%xmm6,%xmm6
+       movl    %eax,%edi
+       addl    40(%rsp),%ebp
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm6,%xmm6
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm6,%xmm6
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    44(%rsp),%edx
+       vpxor   %xmm10,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpalignr        $8,%xmm3,%xmm4,%xmm7
+       movl    %edx,%edi
+       addl    48(%rsp),%ecx
+       vpaddd  %xmm6,%xmm11,%xmm9
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrldq $4,%xmm6,%xmm8
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       vpxor   %xmm3,%xmm7,%xmm7
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm5,%xmm8,%xmm8
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    52(%rsp),%ebx
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       vpsrld  $31,%xmm7,%xmm8
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpslldq $12,%xmm7,%xmm10
+       vpaddd  %xmm7,%xmm7,%xmm7
+       movl    %ebx,%edi
+       addl    56(%rsp),%eax
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm7,%xmm7
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm7,%xmm7
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    60(%rsp),%ebp
+       vpxor   %xmm10,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       movl    %ebp,%edi
+       addl    0(%rsp),%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm7,%xmm11,%xmm9
+       addl    %esi,%edx
+       andl    %eax,%edi
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       movl    %edx,%esi
+       addl    4(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpslld  $2,%xmm0,%xmm0
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       movl    %ecx,%edi
+       addl    8(%rsp),%ebx
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    12(%rsp),%eax
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm2,%xmm1,%xmm1
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm0,%xmm11,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm1,%xmm1
+       addl    20(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm1,%xmm1
+       addl    24(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm1,%xmm1
+       addl    28(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       vpxor   %xmm3,%xmm2,%xmm2
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       vpaddd  %xmm1,%xmm11,%xmm9
+       vmovdqa 0(%r14),%xmm11
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm8,%xmm2,%xmm2
+       addl    36(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm2,%xmm2
+       addl    40(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpor    %xmm8,%xmm2,%xmm2
+       addl    44(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       vpaddd  %xmm2,%xmm11,%xmm9
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       vpxor   %xmm0,%xmm4,%xmm4
+       addl    0(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpxor   %xmm5,%xmm4,%xmm4
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       vpaddd  %xmm3,%xmm11,%xmm9
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm8,%xmm4,%xmm4
+       addl    4(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       vpsrld  $30,%xmm4,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpslld  $2,%xmm4,%xmm4
+       addl    8(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    12(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       vpxor   %xmm1,%xmm5,%xmm5
+       addl    16(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpxor   %xmm6,%xmm5,%xmm5
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       vpaddd  %xmm4,%xmm11,%xmm9
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpxor   %xmm8,%xmm5,%xmm5
+       addl    20(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm5,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm5,%xmm5
+       addl    24(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    28(%rsp),%eax
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       vpxor   %xmm2,%xmm6,%xmm6
+       addl    32(%rsp),%ebp
+       andl    %ecx,%esi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       vpaddd  %xmm5,%xmm11,%xmm9
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    36(%rsp),%edx
+       vpsrld  $30,%xmm6,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       vpslld  $2,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    40(%rsp),%ecx
+       andl    %eax,%esi
+       vpor    %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    44(%rsp),%ebx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%esi
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       vpxor   %xmm3,%xmm7,%xmm7
+       addl    48(%rsp),%eax
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       vpxor   %xmm0,%xmm7,%xmm7
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       vpaddd  %xmm6,%xmm11,%xmm9
+       vmovdqa 32(%r14),%xmm11
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    52(%rsp),%ebp
+       vpsrld  $30,%xmm7,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       vpslld  $2,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    56(%rsp),%edx
+       andl    %ebx,%esi
+       vpor    %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    60(%rsp),%ecx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       addl    0(%rsp),%ebx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       vpaddd  %xmm7,%xmm11,%xmm9
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    4(%rsp),%eax
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       vpslld  $2,%xmm0,%xmm0
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    8(%rsp),%ebp
+       andl    %ecx,%esi
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    12(%rsp),%edx
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ecx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       vpxor   %xmm2,%xmm1,%xmm1
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       vpaddd  %xmm0,%xmm11,%xmm9
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vpxor   %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    20(%rsp),%ebx
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%esi
+       vpslld  $2,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    24(%rsp),%eax
+       andl    %edx,%esi
+       vpor    %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    28(%rsp),%ebp
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%edx
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       vpxor   %xmm3,%xmm2,%xmm2
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       vpaddd  %xmm1,%xmm11,%xmm9
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       vpxor   %xmm8,%xmm2,%xmm2
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    36(%rsp),%ecx
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       vpslld  $2,%xmm2,%xmm2
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    40(%rsp),%ebx
+       andl    %ebp,%esi
+       vpor    %xmm8,%xmm2,%xmm2
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    44(%rsp),%eax
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm2,%xmm11,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    0(%rsp),%eax
+       vpaddd  %xmm3,%xmm11,%xmm9
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vmovdqa %xmm9,48(%rsp)
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    4(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    8(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    12(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       cmpq    %r10,%r9
+       je      .Ldone_avx
+       vmovdqa 64(%r14),%xmm6
+       vmovdqa -64(%r14),%xmm11
+       vmovdqu 0(%r9),%xmm0
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0
+       addq    $64,%r9
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       vpshufb %xmm6,%xmm1,%xmm1
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpaddd  %xmm11,%xmm0,%xmm4
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vmovdqa %xmm4,0(%rsp)
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       vpshufb %xmm6,%xmm2,%xmm2
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpaddd  %xmm11,%xmm1,%xmm5
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vmovdqa %xmm5,16(%rsp)
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       vpshufb %xmm6,%xmm3,%xmm3
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm11,%xmm2,%xmm6
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vmovdqa %xmm6,32(%rsp)
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    0(%r8),%eax
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       addl    12(%r8),%edx
+       movl    %eax,0(%r8)
+       addl    16(%r8),%ebp
+       movl    %esi,4(%r8)
+       movl    %esi,%ebx
+       movl    %ecx,8(%r8)
+       movl    %ecx,%edi
+       movl    %edx,12(%r8)
+       xorl    %edx,%edi
+       movl    %ebp,16(%r8)
+       andl    %edi,%esi
+       jmp     .Loop_avx
+
+.align 16
+.Ldone_avx:
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vzeroupper
+
+       addl    0(%r8),%eax
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       movl    %eax,0(%r8)
+       addl    12(%r8),%edx
+       movl    %esi,4(%r8)
+       addl    16(%r8),%ebp
+       movl    %ecx,8(%r8)
+       movl    %edx,12(%r8)
+       movl    %ebp,16(%r8)
+       movq    -40(%r11),%r14
+.cfi_restore   %r14
+       movq    -32(%r11),%r13
+.cfi_restore   %r13
+       movq    -24(%r11),%r12
+.cfi_restore   %r12
+       movq    -16(%r11),%rbp
+.cfi_restore   %rbp
+       movq    -8(%r11),%rbx
+.cfi_restore   %rbx
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type  sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc 
+       movq    %rsp,%r11
+.cfi_def_cfa_register  %r11
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       vzeroupper
+       movq    %rdi,%r8
+       movq    %rsi,%r9
+       movq    %rdx,%r10
+
+       leaq    -640(%rsp),%rsp
+       shlq    $6,%r10
+       leaq    64(%r9),%r13
+       andq    $-128,%rsp
+       addq    %r9,%r10
+       leaq    K_XX_XX+64(%rip),%r14
+
+       movl    0(%r8),%eax
+       cmpq    %r10,%r13
+       cmovaeq %r9,%r13
+       movl    4(%r8),%ebp
+       movl    8(%r8),%ecx
+       movl    12(%r8),%edx
+       movl    16(%r8),%esi
+       vmovdqu 64(%r14),%ymm6
+
+       vmovdqu (%r9),%xmm0
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       leaq    64(%r9),%r9
+       vinserti128     $1,(%r13),%ymm0,%ymm0
+       vinserti128     $1,16(%r13),%ymm1,%ymm1
+       vpshufb %ymm6,%ymm0,%ymm0
+       vinserti128     $1,32(%r13),%ymm2,%ymm2
+       vpshufb %ymm6,%ymm1,%ymm1
+       vinserti128     $1,48(%r13),%ymm3,%ymm3
+       vpshufb %ymm6,%ymm2,%ymm2
+       vmovdqu -64(%r14),%ymm11
+       vpshufb %ymm6,%ymm3,%ymm3
+
+       vpaddd  %ymm11,%ymm0,%ymm4
+       vpaddd  %ymm11,%ymm1,%ymm5
+       vmovdqu %ymm4,0(%rsp)
+       vpaddd  %ymm11,%ymm2,%ymm6
+       vmovdqu %ymm5,32(%rsp)
+       vpaddd  %ymm11,%ymm3,%ymm7
+       vmovdqu %ymm6,64(%rsp)
+       vmovdqu %ymm7,96(%rsp)
+       vpalignr        $8,%ymm0,%ymm1,%ymm4
+       vpsrldq $4,%ymm3,%ymm8
+       vpxor   %ymm0,%ymm4,%ymm4
+       vpxor   %ymm2,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm4,%ymm4
+       vpsrld  $31,%ymm4,%ymm8
+       vpslldq $12,%ymm4,%ymm10
+       vpaddd  %ymm4,%ymm4,%ymm4
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm4,%ymm4
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm4,%ymm4
+       vpxor   %ymm10,%ymm4,%ymm4
+       vpaddd  %ymm11,%ymm4,%ymm9
+       vmovdqu %ymm9,128(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm5
+       vpsrldq $4,%ymm4,%ymm8
+       vpxor   %ymm1,%ymm5,%ymm5
+       vpxor   %ymm3,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm5,%ymm5
+       vpsrld  $31,%ymm5,%ymm8
+       vmovdqu -32(%r14),%ymm11
+       vpslldq $12,%ymm5,%ymm10
+       vpaddd  %ymm5,%ymm5,%ymm5
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm5,%ymm5
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm5,%ymm5
+       vpxor   %ymm10,%ymm5,%ymm5
+       vpaddd  %ymm11,%ymm5,%ymm9
+       vmovdqu %ymm9,160(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm6
+       vpsrldq $4,%ymm5,%ymm8
+       vpxor   %ymm2,%ymm6,%ymm6
+       vpxor   %ymm4,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm6,%ymm6
+       vpsrld  $31,%ymm6,%ymm8
+       vpslldq $12,%ymm6,%ymm10
+       vpaddd  %ymm6,%ymm6,%ymm6
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm6,%ymm6
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm6,%ymm6
+       vpxor   %ymm10,%ymm6,%ymm6
+       vpaddd  %ymm11,%ymm6,%ymm9
+       vmovdqu %ymm9,192(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm7
+       vpsrldq $4,%ymm6,%ymm8
+       vpxor   %ymm3,%ymm7,%ymm7
+       vpxor   %ymm5,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm7,%ymm7
+       vpsrld  $31,%ymm7,%ymm8
+       vpslldq $12,%ymm7,%ymm10
+       vpaddd  %ymm7,%ymm7,%ymm7
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm7,%ymm7
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm7,%ymm7
+       vpxor   %ymm10,%ymm7,%ymm7
+       vpaddd  %ymm11,%ymm7,%ymm9
+       vmovdqu %ymm9,224(%rsp)
+       leaq    128(%rsp),%r13
+       jmp     .Loop_avx2
+.align 32
+.Loop_avx2:
+       rorxl   $2,%ebp,%ebx
+       andnl   %edx,%ebp,%edi
+       andl    %ecx,%ebp
+       xorl    %edi,%ebp
+       jmp     .Lalign32_1
+.align 32
+.Lalign32_1:
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       vpxor   %ymm4,%ymm0,%ymm0
+       addl    -128(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       vpxor   %ymm1,%ymm0,%ymm0
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpxor   %ymm8,%ymm0,%ymm0
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       vpsrld  $30,%ymm0,%ymm8
+       vpslld  $2,%ymm0,%ymm0
+       addl    -124(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       vpor    %ymm8,%ymm0,%ymm0
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -120(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       vpaddd  %ymm11,%ymm0,%ymm9
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       vmovdqu %ymm9,256(%rsp)
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -116(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -96(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       vpxor   %ymm5,%ymm1,%ymm1
+       addl    -92(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       vpxor   %ymm2,%ymm1,%ymm1
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       vpxor   %ymm8,%ymm1,%ymm1
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       vpsrld  $30,%ymm1,%ymm8
+       vpslld  $2,%ymm1,%ymm1
+       addl    -88(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       vpor    %ymm8,%ymm1,%ymm1
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -84(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       vmovdqu %ymm9,288(%rsp)
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -64(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -60(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       vpxor   %ymm6,%ymm2,%ymm2
+       addl    -56(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       vpxor   %ymm3,%ymm2,%ymm2
+       vmovdqu 0(%r14),%ymm11
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpxor   %ymm8,%ymm2,%ymm2
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       vpsrld  $30,%ymm2,%ymm8
+       vpslld  $2,%ymm2,%ymm2
+       addl    -52(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       vpor    %ymm8,%ymm2,%ymm2
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -32(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       vpaddd  %ymm11,%ymm2,%ymm9
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       vmovdqu %ymm9,320(%rsp)
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -28(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -24(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       vpxor   %ymm7,%ymm3,%ymm3
+       addl    -20(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       vpxor   %ymm4,%ymm3,%ymm3
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpxor   %ymm8,%ymm3,%ymm3
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       vpsrld  $30,%ymm3,%ymm8
+       vpslld  $2,%ymm3,%ymm3
+       addl    0(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       vpor    %ymm8,%ymm3,%ymm3
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    4(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       vpaddd  %ymm11,%ymm3,%ymm9
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       vmovdqu %ymm9,352(%rsp)
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    8(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    12(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       vpxor   %ymm0,%ymm4,%ymm4
+       addl    32(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpxor   %ymm8,%ymm4,%ymm4
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    36(%r13),%ebx
+       vpsrld  $30,%ymm4,%ymm8
+       vpslld  $2,%ymm4,%ymm4
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vpor    %ymm8,%ymm4,%ymm4
+       addl    40(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpaddd  %ymm11,%ymm4,%ymm9
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    44(%r13),%eax
+       vmovdqu %ymm9,384(%rsp)
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       vpxor   %ymm1,%ymm5,%ymm5
+       addl    68(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm6,%ymm5,%ymm5
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       vpxor   %ymm8,%ymm5,%ymm5
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    72(%r13),%ecx
+       vpsrld  $30,%ymm5,%ymm8
+       vpslld  $2,%ymm5,%ymm5
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       vpor    %ymm8,%ymm5,%ymm5
+       addl    76(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpaddd  %ymm11,%ymm5,%ymm9
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    96(%r13),%ebp
+       vmovdqu %ymm9,416(%rsp)
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    100(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       vpxor   %ymm2,%ymm6,%ymm6
+       addl    104(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       vpxor   %ymm8,%ymm6,%ymm6
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    108(%r13),%edx
+       leaq    256(%r13),%r13
+       vpsrld  $30,%ymm6,%ymm8
+       vpslld  $2,%ymm6,%ymm6
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vpor    %ymm8,%ymm6,%ymm6
+       addl    -128(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpaddd  %ymm11,%ymm6,%ymm9
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -124(%r13),%ebx
+       vmovdqu %ymm9,448(%rsp)
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -120(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       vpxor   %ymm3,%ymm7,%ymm7
+       addl    -116(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       vpxor   %ymm0,%ymm7,%ymm7
+       vmovdqu 32(%r14),%ymm11
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       vpxor   %ymm8,%ymm7,%ymm7
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -96(%r13),%esi
+       vpsrld  $30,%ymm7,%ymm8
+       vpslld  $2,%ymm7,%ymm7
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpor    %ymm8,%ymm7,%ymm7
+       addl    -92(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpaddd  %ymm11,%ymm7,%ymm9
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -88(%r13),%ecx
+       vmovdqu %ymm9,480(%rsp)
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -84(%r13),%ebx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       jmp     .Lalign32_2
+.align 32
+.Lalign32_2:
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       vpxor   %ymm4,%ymm0,%ymm0
+       addl    -64(%r13),%ebp
+       xorl    %esi,%ecx
+       vpxor   %ymm1,%ymm0,%ymm0
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       vpxor   %ymm8,%ymm0,%ymm0
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       vpsrld  $30,%ymm0,%ymm8
+       vpslld  $2,%ymm0,%ymm0
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -60(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       vpor    %ymm8,%ymm0,%ymm0
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       vpaddd  %ymm11,%ymm0,%ymm9
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    -56(%r13),%esi
+       xorl    %ecx,%ebp
+       vmovdqu %ymm9,512(%rsp)
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    -52(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    -32(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       vpxor   %ymm5,%ymm1,%ymm1
+       addl    -28(%r13),%ebx
+       xorl    %eax,%edx
+       vpxor   %ymm2,%ymm1,%ymm1
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       vpxor   %ymm8,%ymm1,%ymm1
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vpsrld  $30,%ymm1,%ymm8
+       vpslld  $2,%ymm1,%ymm1
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    -24(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       vpor    %ymm8,%ymm1,%ymm1
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -20(%r13),%eax
+       xorl    %edx,%ebx
+       vmovdqu %ymm9,544(%rsp)
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    0(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    4(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       vpxor   %ymm6,%ymm2,%ymm2
+       addl    8(%r13),%ecx
+       xorl    %ebp,%esi
+       vpxor   %ymm3,%ymm2,%ymm2
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       vpxor   %ymm8,%ymm2,%ymm2
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpsrld  $30,%ymm2,%ymm8
+       vpslld  $2,%ymm2,%ymm2
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    12(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       vpor    %ymm8,%ymm2,%ymm2
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vpaddd  %ymm11,%ymm2,%ymm9
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    32(%r13),%ebp
+       xorl    %esi,%ecx
+       vmovdqu %ymm9,576(%rsp)
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    36(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    40(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       vpxor   %ymm7,%ymm3,%ymm3
+       addl    44(%r13),%edx
+       xorl    %ebx,%eax
+       vpxor   %ymm4,%ymm3,%ymm3
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm8,%ymm3,%ymm3
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       vpsrld  $30,%ymm3,%ymm8
+       vpslld  $2,%ymm3,%ymm3
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    64(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       vpor    %ymm8,%ymm3,%ymm3
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpaddd  %ymm11,%ymm3,%ymm9
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    68(%r13),%ebx
+       xorl    %eax,%edx
+       vmovdqu %ymm9,608(%rsp)
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    72(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    76(%r13),%eax
+       xorl    %edx,%ebx
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    100(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    104(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    108(%r13),%ebx
+       leaq    256(%r13),%r13
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -128(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -124(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -120(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -116(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -96(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -92(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -88(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -84(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -60(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -56(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -52(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -32(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -28(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -24(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -20(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       addl    %r12d,%edx
+       leaq    128(%r9),%r13
+       leaq    128(%r9),%rdi
+       cmpq    %r10,%r13
+       cmovaeq %r9,%r13
+
+
+       addl    0(%r8),%edx
+       addl    4(%r8),%esi
+       addl    8(%r8),%ebp
+       movl    %edx,0(%r8)
+       addl    12(%r8),%ebx
+       movl    %esi,4(%r8)
+       movl    %edx,%eax
+       addl    16(%r8),%ecx
+       movl    %ebp,%r12d
+       movl    %ebp,8(%r8)
+       movl    %ebx,%edx
+
+       movl    %ebx,12(%r8)
+       movl    %esi,%ebp
+       movl    %ecx,16(%r8)
+
+       movl    %ecx,%esi
+       movl    %r12d,%ecx
+
+
+       cmpq    %r10,%r9
+       je      .Ldone_avx2
+       vmovdqu 64(%r14),%ymm6
+       cmpq    %r10,%rdi
+       ja      .Last_avx2
+
+       vmovdqu -64(%rdi),%xmm0
+       vmovdqu -48(%rdi),%xmm1
+       vmovdqu -32(%rdi),%xmm2
+       vmovdqu -16(%rdi),%xmm3
+       vinserti128     $1,0(%r13),%ymm0,%ymm0
+       vinserti128     $1,16(%r13),%ymm1,%ymm1
+       vinserti128     $1,32(%r13),%ymm2,%ymm2
+       vinserti128     $1,48(%r13),%ymm3,%ymm3
+       jmp     .Last_avx2
+
+.align 32
+.Last_avx2:
+       leaq    128+16(%rsp),%r13
+       rorxl   $2,%ebp,%ebx
+       andnl   %edx,%ebp,%edi
+       andl    %ecx,%ebp
+       xorl    %edi,%ebp
+       subq    $-128,%r9
+       addl    -128(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -124(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -120(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -116(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -96(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    -92(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -88(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -84(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -64(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -60(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -56(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    -52(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -32(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -28(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -24(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -20(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    0(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    4(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    8(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    12(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    32(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    36(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    40(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    44(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vmovdqu -64(%r14),%ymm11
+       vpshufb %ymm6,%ymm0,%ymm0
+       addl    68(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    72(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    76(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    96(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    100(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpshufb %ymm6,%ymm1,%ymm1
+       vpaddd  %ymm11,%ymm0,%ymm8
+       addl    104(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    108(%r13),%edx
+       leaq    256(%r13),%r13
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -128(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -124(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -120(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vmovdqu %ymm8,0(%rsp)
+       vpshufb %ymm6,%ymm2,%ymm2
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    -116(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -92(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -88(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -84(%r13),%ebx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       vmovdqu %ymm9,32(%rsp)
+       vpshufb %ymm6,%ymm3,%ymm3
+       vpaddd  %ymm11,%ymm2,%ymm6
+       addl    -64(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -60(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    -56(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    -52(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    -32(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       jmp     .Lalign32_3
+.align 32
+.Lalign32_3:
+       vmovdqu %ymm6,64(%rsp)
+       vpaddd  %ymm11,%ymm3,%ymm7
+       addl    -28(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    -24(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -20(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    0(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    4(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       vmovdqu %ymm7,96(%rsp)
+       addl    8(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    12(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    32(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    36(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    40(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       vpalignr        $8,%ymm0,%ymm1,%ymm4
+       addl    44(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       vpsrldq $4,%ymm3,%ymm8
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpxor   %ymm0,%ymm4,%ymm4
+       vpxor   %ymm2,%ymm8,%ymm8
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpxor   %ymm8,%ymm4,%ymm4
+       andl    %edi,%esi
+       addl    64(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       vpsrld  $31,%ymm4,%ymm8
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       vpslldq $12,%ymm4,%ymm10
+       vpaddd  %ymm4,%ymm4,%ymm4
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm4,%ymm4
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm4,%ymm4
+       addl    68(%r13),%ebx
+       xorl    %eax,%edx
+       vpxor   %ymm10,%ymm4,%ymm4
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       vpaddd  %ymm11,%ymm4,%ymm9
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vmovdqu %ymm9,128(%rsp)
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    72(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    76(%r13),%eax
+       xorl    %edx,%ebx
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpalignr        $8,%ymm1,%ymm2,%ymm5
+       addl    96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpsrldq $4,%ymm4,%ymm8
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpxor   %ymm1,%ymm5,%ymm5
+       vpxor   %ymm3,%ymm8,%ymm8
+       addl    100(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm8,%ymm5,%ymm5
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpsrld  $31,%ymm5,%ymm8
+       vmovdqu -32(%r14),%ymm11
+       xorl    %ebx,%esi
+       addl    104(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       vpslldq $12,%ymm5,%ymm10
+       vpaddd  %ymm5,%ymm5,%ymm5
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm5,%ymm5
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm5,%ymm5
+       xorl    %ebp,%edx
+       addl    108(%r13),%ebx
+       leaq    256(%r13),%r13
+       vpxor   %ymm10,%ymm5,%ymm5
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpaddd  %ymm11,%ymm5,%ymm9
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vmovdqu %ymm9,160(%rsp)
+       addl    -128(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpalignr        $8,%ymm2,%ymm3,%ymm6
+       addl    -124(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       vpsrldq $4,%ymm5,%ymm8
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpxor   %ymm2,%ymm6,%ymm6
+       vpxor   %ymm4,%ymm8,%ymm8
+       addl    -120(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpxor   %ymm8,%ymm6,%ymm6
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       vpsrld  $31,%ymm6,%ymm8
+       xorl    %ecx,%eax
+       addl    -116(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpslldq $12,%ymm6,%ymm10
+       vpaddd  %ymm6,%ymm6,%ymm6
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm6,%ymm6
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm6,%ymm6
+       xorl    %ebx,%esi
+       addl    -96(%r13),%ecx
+       vpxor   %ymm10,%ymm6,%ymm6
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpaddd  %ymm11,%ymm6,%ymm9
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       vmovdqu %ymm9,192(%rsp)
+       addl    -92(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vpalignr        $8,%ymm3,%ymm4,%ymm7
+       addl    -88(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpsrldq $4,%ymm6,%ymm8
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpxor   %ymm3,%ymm7,%ymm7
+       vpxor   %ymm5,%ymm8,%ymm8
+       addl    -84(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       vpxor   %ymm8,%ymm7,%ymm7
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       vpsrld  $31,%ymm7,%ymm8
+       xorl    %edx,%ebp
+       addl    -64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpslldq $12,%ymm7,%ymm10
+       vpaddd  %ymm7,%ymm7,%ymm7
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm7,%ymm7
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm7,%ymm7
+       xorl    %ecx,%eax
+       addl    -60(%r13),%edx
+       vpxor   %ymm10,%ymm7,%ymm7
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpaddd  %ymm11,%ymm7,%ymm9
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vmovdqu %ymm9,224(%rsp)
+       addl    -56(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -52(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -32(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -28(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -24(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -20(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       addl    %r12d,%edx
+       leaq    128(%rsp),%r13
+
+
+       addl    0(%r8),%edx
+       addl    4(%r8),%esi
+       addl    8(%r8),%ebp
+       movl    %edx,0(%r8)
+       addl    12(%r8),%ebx
+       movl    %esi,4(%r8)
+       movl    %edx,%eax
+       addl    16(%r8),%ecx
+       movl    %ebp,%r12d
+       movl    %ebp,8(%r8)
+       movl    %ebx,%edx
+
+       movl    %ebx,12(%r8)
+       movl    %esi,%ebp
+       movl    %ecx,16(%r8)
+
+       movl    %ecx,%esi
+       movl    %r12d,%ecx
+
+
+       cmpq    %r10,%r9
+       jbe     .Loop_avx2
+
+.Ldone_avx2:
+       vzeroupper
+       movq    -40(%r11),%r14
+.cfi_restore   %r14
+       movq    -32(%r11),%r13
+.cfi_restore   %r13
+       movq    -24(%r11),%r12
+.cfi_restore   %r12
+       movq    -16(%r11),%rbp
+.cfi_restore   %rbp
+       movq    -8(%r11),%rbx
+.cfi_restore   %rbx
+       leaq    (%r11),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx2:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+.align 64      # start the constant table on a 64-byte boundary
+K_XX_XX:       # SHA-1 constant table; each round constant below is replicated 8x (32 bytes, one YMM register's worth)
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # offset 0: K for SHA-1 rounds 0-19
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # offset 32: K for SHA-1 rounds 20-39
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # offset 64: K for SHA-1 rounds 40-59
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # offset 96: K for SHA-1 rounds 60-79
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # offset 128: shuffle mask; stored little-endian as bytes 3,2,1,0,7,6,5,4,... so a (v)pshufb with it byte-swaps each 32-bit word
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # same mask for the upper 128-bit lane. NOTE(review): presumably what the AVX2 code loads via 64(%r14) into %ymm6 -- %r14 setup is outside this view, confirm
+.byte  0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0        # offset 160: byte indices 15..0, i.e. a full 16-byte reversal if used as a shuffle mask; its consumer is not visible in this chunk
+.byte  83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0        # NUL-terminated ASCII: "SHA1 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+.align 64      # pad past the identification string to the next 64-byte boundary
+
+.section .note.GNU-stack,"",%progbits
index 7470ef74075973f773460c7bdfc56a4bd2038179..8d9aaa4a815e1be21fa6c11808332f783cf413a4 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha512-586.s"
 .text
 .globl sha256_block_data_order
 .type  sha256_block_data_order,@function
@@ -64,20 +63,6 @@ sha256_block_data_order:
        movl    %edi,4(%esp)
        movl    %eax,8(%esp)
        movl    %ebx,12(%esp)
-       leal    _gnutls_x86_cpuid_s-.L001K256(%ebp),%edx
-       movl    (%edx),%ecx
-       movl    4(%edx),%edx
-       testl   $1048576,%ecx
-       jnz     .L002loop
-       testl   $2048,%edx
-       andl    $1073741824,%ecx
-       andl    $268435456,%edx
-       orl     %edx,%ecx
-       cmpl    $1342177280,%ecx
-       je      .L003loop_shrd
-       subl    %edi,%eax
-       cmpl    $256,%eax
-       jae     .L004unrolled
        jmp     .L002loop
 .align 16
 .L002loop:
@@ -149,7 +134,7 @@ sha256_block_data_order:
        movl    %ecx,28(%esp)
        movl    %edi,32(%esp)
 .align 16
-.L00500_15:
+.L00300_15:
        movl    %edx,%ecx
        movl    24(%esp),%esi
        rorl    $14,%ecx
@@ -187,11 +172,11 @@ sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3248222580,%esi
-       jne     .L00500_15
+       jne     .L00300_15
        movl    156(%esp),%ecx
-       jmp     .L00616_63
+       jmp     .L00416_63
 .align 16
-.L00616_63:
+.L00416_63:
        movl    %ecx,%ebx
        movl    104(%esp),%esi
        rorl    $11,%ecx
@@ -246,7 +231,7 @@ sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3329325298,%esi
-       jne     .L00616_63
+       jne     .L00416_63
        movl    356(%esp),%esi
        movl    8(%esp),%ebx
        movl    16(%esp),%ecx
@@ -280,8 +265,8 @@ sha256_block_data_order:
        popl    %ebx
        popl    %ebp
        ret
-.align 16
-.L003loop_shrd:
+.align 32
+.L005loop_shrd:
        movl    (%edi),%eax
        movl    4(%edi),%ebx
        movl    8(%edi),%ecx
@@ -350,7 +335,7 @@ sha256_block_data_order:
        movl    %ecx,28(%esp)
        movl    %edi,32(%esp)
 .align 16
-.L00700_15_shrd:
+.L00600_15_shrd:
        movl    %edx,%ecx
        movl    24(%esp),%esi
        shrdl   $14,%ecx,%ecx
@@ -388,11 +373,11 @@ sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3248222580,%esi
-       jne     .L00700_15_shrd
+       jne     .L00600_15_shrd
        movl    156(%esp),%ecx
-       jmp     .L00816_63_shrd
+       jmp     .L00716_63_shrd
 .align 16
-.L00816_63_shrd:
+.L00716_63_shrd:
        movl    %ecx,%ebx
        movl    104(%esp),%esi
        shrdl   $11,%ecx,%ecx
@@ -447,7 +432,7 @@ sha256_block_data_order:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3329325298,%esi
-       jne     .L00816_63_shrd
+       jne     .L00716_63_shrd
        movl    356(%esp),%esi
        movl    8(%esp),%ebx
        movl    16(%esp),%ecx
@@ -474,7 +459,7 @@ sha256_block_data_order:
        leal    356(%esp),%esp
        subl    $256,%ebp
        cmpl    8(%esp),%edi
-       jb      .L003loop_shrd
+       jb      .L005loop_shrd
        movl    12(%esp),%esp
        popl    %edi
        popl    %esi
@@ -485,8 +470,13 @@ sha256_block_data_order:
 .L001K256:
 .long  1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
 .long  66051,67438087,134810123,202182159
+.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte  62,0
 .align 16
-.L004unrolled:
+.L008unrolled:
        leal    -96(%esp),%esp
        movl    (%esi),%eax
        movl    4(%esi),%ebp
@@ -3393,14 +3383,5 @@ sha256_block_data_order:
        popl    %ebp
        ret
 .size  sha256_block_data_order,.-.L_sha256_block_data_order_begin
-.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte  62,0
-.comm  _gnutls_x86_cpuid_s,16,4
-
 
 .section .note.GNU-stack,"",%progbits
-
-
diff --git a/lib/accelerated/x86/elf/sha256-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha256-ssse3-x86_64.s
new file mode 100644 (file)
index 0000000..4b08e0c
--- /dev/null
@@ -0,0 +1,5471 @@
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+#     * Redistributions of source code must retain copyright notices,
+#      this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above
+#      copyright notice, this list of conditions and the following
+#      disclaimer in the documentation and/or other materials
+#      provided with the distribution.
+#
+#     * Neither the name of the Andy Polyakov nor the names of its
+#      copyright holder and contributors may be used to endorse or
+#      promote products derived from this software without specific
+#      prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text  
+
+
+.globl sha256_block_data_order
+.type  sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.cfi_startproc 
+       leaq    _gnutls_x86_cpuid_s(%rip),%r11
+       movl    0(%r11),%r9d
+       movl    4(%r11),%r10d
+       movl    8(%r11),%r11d
+       testl   $536870912,%r11d
+       jnz     _shaext_shortcut
+       andl    $296,%r11d
+       cmpl    $296,%r11d
+       je      .Lavx2_shortcut
+       andl    $1073741824,%r9d
+       andl    $268435968,%r10d
+       orl     %r9d,%r10d
+       cmpl    $1342177792,%r10d
+       je      .Lavx_shortcut
+       testl   $512,%r10d
+       jnz     .Lssse3_shortcut
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $64+32,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       jmp     .Lloop
+
+.align 16
+.Lloop:
+       movl    %ebx,%edi
+       leaq    K256(%rip),%rbp
+       xorl    %ecx,%edi
+       movl    0(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    4(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    8(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    12(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    16(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    20(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    24(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    28(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%eax
+       movl    32(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    36(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    40(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    44(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    48(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    52(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    56(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    60(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       jmp     .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+       movl    4(%rsp),%r13d
+       movl    56(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    36(%rsp),%r12d
+
+       addl    0(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       movl    8(%rsp),%r13d
+       movl    60(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    40(%rsp),%r12d
+
+       addl    4(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       movl    12(%rsp),%r13d
+       movl    0(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    44(%rsp),%r12d
+
+       addl    8(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       movl    16(%rsp),%r13d
+       movl    4(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    48(%rsp),%r12d
+
+       addl    12(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       movl    20(%rsp),%r13d
+       movl    8(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    52(%rsp),%r12d
+
+       addl    16(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       movl    24(%rsp),%r13d
+       movl    12(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    56(%rsp),%r12d
+
+       addl    20(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       movl    28(%rsp),%r13d
+       movl    16(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    60(%rsp),%r12d
+
+       addl    24(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       movl    32(%rsp),%r13d
+       movl    20(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    0(%rsp),%r12d
+
+       addl    28(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       movl    36(%rsp),%r13d
+       movl    24(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    4(%rsp),%r12d
+
+       addl    32(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       movl    40(%rsp),%r13d
+       movl    28(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    8(%rsp),%r12d
+
+       addl    36(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       movl    44(%rsp),%r13d
+       movl    32(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    12(%rsp),%r12d
+
+       addl    40(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       movl    48(%rsp),%r13d
+       movl    36(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    16(%rsp),%r12d
+
+       addl    44(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       movl    52(%rsp),%r13d
+       movl    40(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    20(%rsp),%r12d
+
+       addl    48(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       movl    56(%rsp),%r13d
+       movl    44(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    24(%rsp),%r12d
+
+       addl    52(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       movl    60(%rsp),%r13d
+       movl    48(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    28(%rsp),%r12d
+
+       addl    56(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       movl    0(%rsp),%r13d
+       movl    52(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    32(%rsp),%r12d
+
+       addl    60(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jnz     .Lrounds_16_xx
+
+       movq    64+0(%rsp),%rdi
+       addl    %r14d,%eax
+       leaq    64(%rsi),%rsi
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop
+
+       movq    88(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  sha256_block_data_order,.-sha256_block_data_order
+.align 64                                              # table base is 64-byte aligned
+.type  K256,@object
+K256:                                                  # SHA-256 round constants K[0..63]; each row of four is stored twice (512 bytes, 32-byte row stride)
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5    # K[0..3]
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5    # (repeat)
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5    # K[4..7]
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5    # (repeat)
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3    # K[8..11]
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3    # (repeat)
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174    # K[12..15]
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174    # (repeat)
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc    # K[16..19]
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc    # (repeat)
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da    # K[20..23]
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da    # (repeat)
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7    # K[24..27]
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7    # (repeat)
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967    # K[28..31]
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967    # (repeat)
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13    # K[32..35]
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13    # (repeat)
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85    # K[36..39]
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85    # (repeat)
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3    # K[40..43]
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3    # (repeat)
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070    # K[44..47]
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070    # (repeat)
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5    # K[48..51]
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5    # (repeat)
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3    # K[52..55]
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3    # (repeat)
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208    # K[56..59]
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208    # (repeat)
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2    # K[60..63]
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2    # (repeat)
+
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f    # K256+512: as a pshufb mask this byte-reverses each 32-bit word
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f    # (repeat)
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff    # K256+544: as a pshufb mask, dwords 0 and 2 go to the low half and the high half is zeroed; user not visible in the reviewed part of the file
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff    # (repeat)
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908    # K256+576: same selection placed in the high half, low half zeroed; user not visible in the reviewed part of the file
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908    # (repeat)
+.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0    # NUL-terminated ASCII identification string (algorithm name, target and CRYPTOGAMS author credit)
+.type  sha256_block_data_order_shaext,@function        # SHA-256 block function using SHA-NI opcodes, hand-encoded as .byte
+.align 64                                  # leaf routine: no stack use; modifies rcx, rdx, rsi, flags and xmm0-xmm10
+sha256_block_data_order_shaext:            # as used below: rdi = eight 32-bit state words, rsi = input data, rdx = number of 64-byte blocks
+_shaext_shortcut:                          # second label on the same entry; NOTE(review): rdx is tested only after decrementing, so callers presumably never pass 0 - confirm
+       leaq    K256+128(%rip),%rcx         # rcx = K256+128; constant rows are addressed as N-128(%rcx)
+       movdqu  (%rdi),%xmm1                # state words 0..3 (call them A,B,C,D)
+       movdqu  16(%rdi),%xmm2              # state words 4..7 (E,F,G,H)
+       movdqa  512-128(%rcx),%xmm7         # xmm7 = byte-reversal mask stored at K256+512
+
+       pshufd  $0x1b,%xmm1,%xmm0           # xmm0 = D,C,B,A (dword 0 first)
+       pshufd  $0xb1,%xmm1,%xmm1           # xmm1 = B,A,D,C
+       pshufd  $0x1b,%xmm2,%xmm2           # xmm2 = H,G,F,E
+       movdqa  %xmm7,%xmm8                 # keep the mask; xmm7 is reused as scratch inside the loop
+.byte  102,15,58,15,202,8                  # palignr $8,%xmm2,%xmm1 -> xmm1 = F,E,B,A
+       punpcklqdq      %xmm0,%xmm2         # xmm2 = H,G,D,C; the two halves sha256rnds2 alternates between
+       jmp     .Loop_shaext                # skip the alignment padding
+
+.align 16
+.Loop_shaext:                              # one iteration per 64-byte block
+       movdqu  (%rsi),%xmm3                # message bytes 0..15 (unaligned load)
+       movdqu  16(%rsi),%xmm4              # bytes 16..31
+       movdqu  32(%rsi),%xmm5              # bytes 32..47
+.byte  102,15,56,0,223                     # pshufb %xmm7,%xmm3 -> W[0..3], each word byte-reversed
+       movdqu  48(%rsi),%xmm6              # bytes 48..63
+
+       movdqa  0-128(%rcx),%xmm0           # rounds 0..3: xmm0 = K[0..3]
+       paddd   %xmm3,%xmm0                 #   + W[0..3]
+.byte  102,15,56,0,231                     # pshufb %xmm7,%xmm4 -> W[4..7]
+       movdqa  %xmm2,%xmm10                # save the incoming state half for the final add
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2 - two rounds, K+W taken from the low two dwords of xmm0
+       pshufd  $0x0e,%xmm0,%xmm0           # bring the upper two K+W dwords into the low half
+       nop
+       movdqa  %xmm1,%xmm9                 # save the other incoming state half
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1 - next two rounds
+
+       movdqa  32-128(%rcx),%xmm0          # rounds 4..7: K[4..7] + W[4..7]
+       paddd   %xmm4,%xmm0
+.byte  102,15,56,0,239                     # pshufb %xmm7,%xmm5 -> W[8..11]
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       leaq    64(%rsi),%rsi               # advance the input pointer to the next block (flags unaffected)
+.byte  15,56,204,220                       # sha256msg1 %xmm4,%xmm3 - begin schedule for W[16..19]
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  64-128(%rcx),%xmm0          # rounds 8..11
+       paddd   %xmm5,%xmm0
+.byte  102,15,56,0,247                     # pshufb %xmm7,%xmm6 -> W[12..15]; mask in xmm7 is dead until restored from xmm8
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4                  # palignr $4,%xmm5,%xmm7 -> W[9..12], the W[t-7] terms
+       nop
+       paddd   %xmm7,%xmm3                 # fold the W[t-7] terms into the partial schedule
+.byte  15,56,204,229                       # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  96-128(%rcx),%xmm0          # rounds 12..15
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                       # sha256msg2 %xmm6,%xmm3 -> xmm3 = W[16..19]
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4                  # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                       # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  128-128(%rcx),%xmm0         # rounds 16..19; from here the same pattern rotates through xmm3..xmm6
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                       # sha256msg2 %xmm3,%xmm4
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4                  # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                       # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  160-128(%rcx),%xmm0         # rounds 20..23
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                       # sha256msg2 %xmm4,%xmm5
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4                  # palignr $4,%xmm4,%xmm7
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220                       # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  192-128(%rcx),%xmm0         # rounds 24..27
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245                       # sha256msg2 %xmm5,%xmm6
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4                  # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                       # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  224-128(%rcx),%xmm0         # rounds 28..31
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                       # sha256msg2 %xmm6,%xmm3
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4                  # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                       # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  256-128(%rcx),%xmm0         # rounds 32..35
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                       # sha256msg2 %xmm3,%xmm4
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4                  # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                       # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  288-128(%rcx),%xmm0         # rounds 36..39
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                       # sha256msg2 %xmm4,%xmm5
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4                  # palignr $4,%xmm4,%xmm7
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220                       # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  320-128(%rcx),%xmm0         # rounds 40..43
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245                       # sha256msg2 %xmm5,%xmm6
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4                  # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                       # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  352-128(%rcx),%xmm0         # rounds 44..47
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                       # sha256msg2 %xmm6,%xmm3
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4                  # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                       # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  384-128(%rcx),%xmm0         # rounds 48..51
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                       # sha256msg2 %xmm3,%xmm4
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4                  # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                       # sha256msg1 %xmm3,%xmm6 - last sha256msg1 of the block
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  416-128(%rcx),%xmm0         # rounds 52..55
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                       # sha256msg2 %xmm4,%xmm5
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4                  # palignr $4,%xmm4,%xmm7
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+       paddd   %xmm7,%xmm6
+
+       movdqa  448-128(%rcx),%xmm0         # rounds 56..59
+       paddd   %xmm5,%xmm0
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+.byte  15,56,205,245                       # sha256msg2 %xmm5,%xmm6 - final schedule words
+       movdqa  %xmm8,%xmm7                 # restore the byte-reversal mask for the next block
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  480-128(%rcx),%xmm0         # rounds 60..63
+       paddd   %xmm6,%xmm0
+       nop
+.byte  15,56,203,209                       # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       decq    %rdx                        # one block done; ZF set here is consumed by the jnz below
+       nop
+.byte  15,56,203,202                       # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       paddd   %xmm10,%xmm2                # add the state saved at the top of the block (SSE adds leave flags intact)
+       paddd   %xmm9,%xmm1
+       jnz     .Loop_shaext                # loop while blocks remain
+
+       pshufd  $0xb1,%xmm2,%xmm2           # undo the entry shuffle: xmm2 = G,H,C,D
+       pshufd  $0x1b,%xmm1,%xmm7           # xmm7 = A,B,E,F
+       pshufd  $0xb1,%xmm1,%xmm1           # xmm1 = E,F,A,B
+       punpckhqdq      %xmm2,%xmm1         # xmm1 = A,B,C,D (state words 0..3 in memory order)
+.byte  102,15,58,15,215,8                  # palignr $8,%xmm7,%xmm2 -> xmm2 = E,F,G,H (state words 4..7)
+
+       movdqu  %xmm1,(%rdi)                # write the updated state back
+       movdqu  %xmm2,16(%rdi)
+       .byte   0xf3,0xc3                   # rep ret
+.size  sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
+.type  sha256_block_data_order_ssse3,@function
+.align 64
+sha256_block_data_order_ssse3:
+.cfi_startproc 
+.Lssse3_shortcut:
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $96,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+
+
+       jmp     .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+       movdqa  K256+512(%rip),%xmm7
+       movdqu  0(%rsi),%xmm0
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+.byte  102,15,56,0,199
+       movdqu  48(%rsi),%xmm3
+       leaq    K256(%rip),%rbp
+.byte  102,15,56,0,207
+       movdqa  0(%rbp),%xmm4
+       movdqa  32(%rbp),%xmm5
+.byte  102,15,56,0,215
+       paddd   %xmm0,%xmm4
+       movdqa  64(%rbp),%xmm6
+.byte  102,15,56,0,223
+       movdqa  96(%rbp),%xmm7
+       paddd   %xmm1,%xmm5
+       paddd   %xmm2,%xmm6
+       paddd   %xmm3,%xmm7
+       movdqa  %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       movdqa  %xmm5,16(%rsp)
+       movl    %ebx,%edi
+       movdqa  %xmm6,32(%rsp)
+       xorl    %ecx,%edi
+       movdqa  %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+       subq    $-128,%rbp
+       rorl    $14,%r13d
+       movdqa  %xmm1,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm3,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,224,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,250,4
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm3,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm0
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm0
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm0,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  0(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm0,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,0(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm2,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm0,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,225,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,251,4
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm0,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm1
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm1
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm1,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  32(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm1,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,16(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm3,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm1,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,226,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,248,4
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm1,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm2
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm2
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm2,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  64(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm2,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,32(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm0,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm2,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,227,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,249,4
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm2,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm3
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm3
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm3,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  96(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm3,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     .Lssse3_00_47
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi          # reload pointer from stack slot 64+0 (presumably the state pointer saved by the prologue, which is outside this view - confirm)
+       movl    %r14d,%eax               # same deferred move that opens every round above: %r14d holds the freshly summed value for %eax
+                                        # Fold the eight 32-bit words at 0..28(%rdi) into the working registers.
+       addl    0(%rdi),%eax
+       leaq    64(%rsi),%rsi            # step %rsi forward by 64 bytes; lea does not touch flags
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+                                        # Loop test is issued early; the stores below are plain moves and keep the flags intact.
+       cmpq    64+16(%rsp),%rsi         # compare advanced %rsi with slot 64+16 (presumably the end-of-input pointer, as in the AVX prologue below - confirm)
+                                        # Write the updated eight words back to 0..28(%rdi).
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop_ssse3             # unsigned below: %rsi has not reached the slot 64+16 value, go around again
+                                        # Epilogue: recover the frame base and the callee-saved registers.
+       movq    88(%rsp),%rsi            # value kept in slot 88; the CFI note on the next line defines CFA as this value + 8
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15           # reload %r15 from -48(%rsi)
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14           # reload %r14 from -40(%rsi)
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13           # reload %r13 from -32(%rsi)
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12           # reload %r12 from -24(%rsi)
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp           # reload %rbp from -16(%rsi)
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx            # reload %rbx from -8(%rsi)
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp              # %rsp = %rsi, dropping the aligned scratch frame in one step
+.cfi_def_cfa_register  %rsp
+.Lepilogue_ssse3:
+       .byte   0xf3,0xc3                # raw encoding of "rep ret" (F3 C3), i.e. the function return
+.cfi_endproc   
+.size  sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.type  sha256_block_data_order_avx,@function      # AVX variant of the SHA-256 block routine. In: %rdi = eight 32-bit state words, %rsi = input, %rdx = count of 64-byte units
+.align 64
+sha256_block_data_order_avx:
+.cfi_startproc 
+.Lavx_shortcut:
+       movq    %rsp,%rax                # keep the entry stack pointer; it is stored at 88(%rsp) further down
+.cfi_def_cfa_register  %rax
+       pushq   %rbx                     # save %rbx, %rbp and %r12-%r15 (all overwritten by the code below)
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx                  # %rdx *= 16 ...
+       subq    $96,%rsp                 # reserve 96 bytes: 64 bytes of schedule scratch plus the four slots stored below
+       leaq    (%rsi,%rdx,4),%rdx       # ... then scaled by 4: %rdx = %rsi + 64 * original %rdx, the end-of-input pointer
+       andq    $-64,%rsp                # round %rsp down to a 64-byte boundary (vmovdqa to 0..48(%rsp) needs 16-byte alignment)
+       movq    %rdi,64+0(%rsp)          # slot 64+0  = state pointer
+       movq    %rsi,64+8(%rsp)          # slot 64+8  = input pointer
+       movq    %rdx,64+16(%rsp)         # slot 64+16 = end-of-input pointer
+       movq    %rax,88(%rsp)            # slot 88    = entry stack pointer
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08      # DW_CFA_def_cfa_expression: CFA = *(%rsp + 88) + 8
+.Lprologue_avx:
+                                        # Load the eight state words into the working registers.
+       vzeroupper
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%xmm8  # constant at K256+512+32; used as a vpshufb mask in the round loop
+       vmovdqa K256+512+64(%rip),%xmm9  # constant at K256+512+64; used as a vpshufb mask in the round loop
+       jmp     .Lloop_avx               # hop over the alignment padding emitted by .align
+.align 16
+.Lloop_avx:
+       vmovdqa K256+512(%rip),%xmm7     # shuffle mask applied to each input vector (presumably a byte-order swap - confirm against the K256 table)
+       vmovdqu 0(%rsi),%xmm0            # unaligned loads of the 64 input bytes into %xmm0-%xmm3
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm7,%xmm0,%xmm0
+       leaq    K256(%rip),%rbp          # %rbp = base of the K256 table, addressed RIP-relative
+       vpshufb %xmm7,%xmm1,%xmm1
+       vpshufb %xmm7,%xmm2,%xmm2
+       vpaddd  0(%rbp),%xmm0,%xmm4      # add table entries to the shuffled words; entries are read at a 32-byte stride
+       vpshufb %xmm7,%xmm3,%xmm3
+       vpaddd  32(%rbp),%xmm1,%xmm5
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       vpaddd  96(%rbp),%xmm3,%xmm7     # %xmm7 (the mask) is overwritten here; it is reloaded at .Lloop_avx
+       vmovdqa %xmm4,0(%rsp)            # park the sums at 0..63(%rsp); the rounds read them via addl N(%rsp)
+       movl    %eax,%r14d               # seed %r14d with %eax for the first round
+       vmovdqa %xmm5,16(%rsp)
+       movl    %ebx,%edi                # %edi = %ebx ...
+       vmovdqa %xmm6,32(%rsp)
+       xorl    %ecx,%edi                # ... ^ %ecx, precomputed for the first round
+       vmovdqa %xmm7,48(%rsp)
+       movl    %r8d,%r13d               # seed %r13d with %r8d for the first round
+       jmp     .Lavx_00_47              # enter the round loop, again skipping alignment padding
+
+.align 16
+.Lavx_00_47:
+       subq    $-128,%rbp
+       vpalignr        $4,%xmm0,%xmm1,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm2,%xmm3,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm3,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm0,%xmm0
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       vpshufd $80,%xmm0,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  0(%rbp),%xmm0,%xmm6
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,0(%rsp)
+       vpalignr        $4,%xmm1,%xmm2,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm3,%xmm0,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm0,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm1,%xmm1
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       vpshufd $80,%xmm1,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  32(%rbp),%xmm1,%xmm6
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,16(%rsp)
+       vpalignr        $4,%xmm2,%xmm3,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm0,%xmm1,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm1,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm2,%xmm2
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       vpshufd $80,%xmm2,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,32(%rsp)
+       vpalignr        $4,%xmm3,%xmm0,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm1,%xmm2,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm2,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm3,%xmm3
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       vpshufd $80,%xmm3,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  96(%rbp),%xmm3,%xmm6
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     .Lavx_00_47
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi         # end of one 64-byte step: reload the pointer kept in frame slot 64+0 (presumably the saved state pointer; this function's prologue is outside this view - confirm)
+       movl    %r14d,%eax              # eax = r14d
+
+       addl    0(%rdi),%eax            # add the eight 32-bit words at 0..28(%rdi) into the working registers
+       leaq    64(%rsi),%rsi           # rsi += 64; lea does not touch the flags
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi        # compare the advanced rsi with the limit in frame slot 64+16; result is consumed by jb below
+
+       movl    %eax,0(%rdi)            # write the eight updated words back; mov leaves the cmpq flags intact
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop_avx              # unsigned test: loop again while rsi is below the limit
+
+       movq    88(%rsp),%rsi           # rsi = value saved in frame slot 88; from here the CFA is described as rsi+8
+.cfi_def_cfa   %rsi,8
+       vzeroupper                      # clear the upper halves of all ymm registers before returning
+       movq    -48(%rsi),%r15          # reload r15, r14, r13, r12, rbp, rbx from the six quadwords just below rsi
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp             # rsp = rsi, discarding the whole frame
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx:
+       .byte   0xf3,0xc3               # raw encoding of "rep ret" (REP prefix 0xf3 + RET 0xc3)
+.cfi_endproc   
+.size  sha256_block_data_order_avx,.-sha256_block_data_order_avx
+.type  sha256_block_data_order_avx2,@function  # entry of the AVX2 path: builds the frame, loads the eight 32-bit state words, then enters .Loop_avx2
+.align 64
+sha256_block_data_order_avx2:          # as used below: rdi = pointer to eight 32-bit words, rsi = input pointer, rdx = number of 64-byte blocks
+.cfi_startproc 
+.Lavx2_shortcut:
+       movq    %rsp,%rax               # rax = rsp at entry; stored in frame slot 88 further down
+.cfi_def_cfa_register  %rax
+       pushq   %rbx                    # save rbx, rbp, r12-r15 (all are overwritten later in this function)
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       subq    $544,%rsp               # frame: rsp = ((rsp - 544) & -1024) + 448, interleaved with the end-pointer computation
+       shlq    $4,%rdx                 # rdx *= 16
+       andq    $-1024,%rsp
+       leaq    (%rsi,%rdx,4),%rdx      # rdx = rsi + 4*rdx = rsi + 64*blocks, i.e. one past the last input byte
+       addq    $448,%rsp
+       movq    %rdi,64+0(%rsp)         # frame slot 64+0  = state pointer
+       movq    %rsi,64+8(%rsp)         # frame slot 64+8  = input pointer
+       movq    %rdx,64+16(%rsp)        # frame slot 64+16 = end-of-input pointer
+       movq    %rax,88(%rsp)           # frame slot 88    = rsp at entry
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08        # DW_CFA_def_cfa_expression: CFA = *(rsp + 88) + 8
+.Lprologue_avx2:
+
+       vzeroupper
+       subq    $-64,%rsi               # rsi += 64
+       movl    0(%rdi),%eax            # load the eight 32-bit state words into eax, ebx, ecx, edx, r8d-r11d
+       movq    %rsi,%r12               # r12 = rsi; .Loop_avx2 reads the high 128-bit lanes through r12
+       movl    4(%rdi),%ebx
+       cmpq    %rdx,%rsi               # does the first block already end at the end pointer? (mov keeps the flags for cmoveq)
+       movl    8(%rdi),%ecx
+       cmoveq  %rsp,%r12               # if so, point r12 at the stack frame instead of at the address just past the input
+       movl    12(%rdi),%edx           # overwrites rdx; the end pointer was already saved in frame slot 64+16
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%ymm8 # ymm8, ymm9 = constants stored after the K256 table; the round code uses them as vpshufb control masks
+       vmovdqa K256+512+64(%rip),%ymm9
+       jmp     .Loop_avx2
+.align 16
+.Loop_avx2:                            # per-iteration setup: load 64 bytes per 128-bit lane, shuffle them, add the first 128 bytes of K256 and spill the sums to the stack
+       vmovdqa K256+512(%rip),%ymm7    # ymm7 = vpshufb control mask (presumably the byte-swap mask - confirm); reloaded here because ymm7 is overwritten below
+       vmovdqu -64+0(%rsi),%xmm0       # low lanes of ymm0-ymm3 = the 64 bytes at rsi-64 (unaligned loads)
+       vmovdqu -64+16(%rsi),%xmm1
+       vmovdqu -64+32(%rsi),%xmm2
+       vmovdqu -64+48(%rsi),%xmm3
+
+       vinserti128     $1,(%r12),%ymm0,%ymm0   # high lanes of ymm0-ymm3 = the 64 bytes at r12
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm7,%ymm0,%ymm0       # reorder the bytes of each vector with the ymm7 mask
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm7,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+
+       leaq    K256(%rip),%rbp         # rbp = address of the K256 table
+       vpshufb %ymm7,%ymm2,%ymm2
+       vpaddd  0(%rbp),%ymm0,%ymm4     # ymm4..ymm7 = ymm0..ymm3 + the 32-byte table rows at rbp+0/32/64/96
+       vpshufb %ymm7,%ymm3,%ymm3
+       vpaddd  32(%rbp),%ymm1,%ymm5
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       vpaddd  96(%rbp),%ymm3,%ymm7    # last use of the mask in ymm7; it now holds a sum
+       vmovdqa %ymm4,0(%rsp)           # spill the first two sums at rsp+0 and rsp+32
+       xorl    %r14d,%r14d             # r14d = 0; the first round adds r14 into eax
+       vmovdqa %ymm5,32(%rsp)
+       leaq    -64(%rsp),%rsp          # rsp -= 64
+       movl    %ebx,%edi
+       vmovdqa %ymm6,0(%rsp)           # spill the other two sums 64 bytes lower
+       xorl    %ecx,%edi               # edi = ebx ^ ecx
+       vmovdqa %ymm7,32(%rsp)
+       movl    %r9d,%r12d              # r12d = r9d; r12 is no longer needed as a pointer in this block
+       subq    $-32*4,%rbp             # rbp += 128, past the four rows just used
+       jmp     .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm0,%ymm1,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm2,%ymm3,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm0,%ymm0
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm3,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm0,%ymm0
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm0,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  0(%rbp),%ymm0,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm1,%ymm2,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm3,%ymm0,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm1,%ymm1
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm0,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm1,%ymm1
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm1,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  32(%rbp),%ymm1,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm2,%ymm3,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm0,%ymm1,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm2,%ymm2
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm1,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm2,%ymm2
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm2,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm3,%ymm0,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm1,%ymm2,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm3,%ymm3
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm2,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm3,%ymm3
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm3,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  96(%rbp),%ymm3,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    128(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jne     .Lavx2_00_47
+       addl    0+64(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+64(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+64(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+64(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+64(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+64(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+64(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+64(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       addl    0(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rbp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       cmpq    80(%rbp),%rsi
+       je      .Ldone_avx2
+
+       xorl    %r14d,%r14d
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       movl    %r9d,%r12d
+       jmp     .Lower_avx2
+.align 16
+.Lower_avx2:
+       addl    0+16(%rbp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+16(%rbp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+16(%rbp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+16(%rbp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+16(%rbp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+16(%rbp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+16(%rbp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+16(%rbp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       leaq    -64(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rsp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       leaq    128(%rsi),%rsi
+       addl    24(%rdi),%r10d
+       movq    %rsi,%r12
+       addl    28(%rdi),%r11d
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+.Ldone_avx2:
+       leaq    (%rbp),%rsp
+       movq    88(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx2:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  sha256_block_data_order_avx2,.-sha256_block_data_order_avx2
+
+.section .note.GNU-stack,"",%progbits
index 0b99b22ec98be3a1c2271a8ba9c48f8f09c15136..481c77715493544895b1fe5599d916816091f7d5 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha512-586.s"
 .text
 .globl sha512_block_data_order
 .type  sha512_block_data_order,@function
@@ -594,6 +593,8 @@ sha512_block_data_order:
 .long  4234509866,1501505948
 .long  987167468,1607167915
 .long  1246189591,1816402316
+.long  67438087,66051
+.long  202182159,134810123
 .size  sha512_block_data_order,.-.L_sha512_block_data_order_begin
 .byte  83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
 .byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
@@ -601,7 +602,4 @@ sha512_block_data_order:
 .byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte  62,0
 
-
 .section .note.GNU-stack,"",%progbits
-
-
index d51d8169bcfe5d4f2424bfcaa2fedd4f8a5b804d..e384d7e9e8337cf897be5ca677f1f3e87e8a52c8 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 .text  
 
 
-.globl sha256_block_data_order
-.type  sha256_block_data_order,@function
+.globl sha512_block_data_order
+.type  sha512_block_data_order,@function
 .align 16
-sha256_block_data_order:
+sha512_block_data_order:
+.cfi_startproc 
        leaq    _gnutls_x86_cpuid_s(%rip),%r11
        movl    0(%r11),%r9d
        movl    4(%r11),%r10d
        movl    8(%r11),%r11d
-       testl   $512,%r10d
-       jnz     .Lssse3_shortcut
+       testl   $2048,%r10d
+       jnz     .Lxop_shortcut
+       andl    $296,%r11d
+       cmpl    $296,%r11d
+       je      .Lavx2_shortcut
+       andl    $1073741824,%r9d
+       andl    $268435968,%r10d
+       orl     %r9d,%r10d
+       cmpl    $1342177792,%r10d
+       je      .Lavx_shortcut
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
        pushq   %rbx
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_offset    %r12,-32
        pushq   %r13
+.cfi_offset    %r13,-40
        pushq   %r14
+.cfi_offset    %r14,-48
        pushq   %r15
-       movq    %rsp,%r11
+.cfi_offset    %r15,-56
        shlq    $4,%rdx
-       subq    $64+32,%rsp
-       leaq    (%rsi,%rdx,4),%rdx
+       subq    $128+32,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
        andq    $-64,%rsp
-       movq    %rdi,64+0(%rsp)
-       movq    %rsi,64+8(%rsp)
-       movq    %rdx,64+16(%rsp)
-       movq    %r11,64+24(%rsp)
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
 .Lprologue:
 
-       movl    0(%rdi),%eax
-       movl    4(%rdi),%ebx
-       movl    8(%rdi),%ecx
-       movl    12(%rdi),%edx
-       movl    16(%rdi),%r8d
-       movl    20(%rdi),%r9d
-       movl    24(%rdi),%r10d
-       movl    28(%rdi),%r11d
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
        jmp     .Lloop
 
 .align 16
 .Lloop:
-       movl    %ebx,%edi
-       leaq    K256(%rip),%rbp
-       xorl    %ecx,%edi
-       movl    0(%rsi),%r12d
-       movl    %r8d,%r13d
-       movl    %eax,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,0(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-       movl    4(%rsi),%r12d
-       movl    %edx,%r13d
-       movl    %r11d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,4(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-       movl    8(%rsi),%r12d
-       movl    %ecx,%r13d
-       movl    %r10d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,8(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-       movl    12(%rsi),%r12d
-       movl    %ebx,%r13d
-       movl    %r9d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,12(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-       movl    16(%rsi),%r12d
-       movl    %eax,%r13d
-       movl    %r8d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,16(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-       movl    20(%rsi),%r12d
-       movl    %r11d,%r13d
-       movl    %edx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,20(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-       movl    24(%rsi),%r12d
-       movl    %r10d,%r13d
-       movl    %ecx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,24(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-       movl    28(%rsi),%r12d
-       movl    %r9d,%r13d
-       movl    %ebx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,28(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-       movl    32(%rsi),%r12d
-       movl    %r8d,%r13d
-       movl    %eax,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,32(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-       movl    36(%rsi),%r12d
-       movl    %edx,%r13d
-       movl    %r11d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,36(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-       movl    40(%rsi),%r12d
-       movl    %ecx,%r13d
-       movl    %r10d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,40(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-       movl    44(%rsi),%r12d
-       movl    %ebx,%r13d
-       movl    %r9d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,44(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-       movl    48(%rsi),%r12d
-       movl    %eax,%r13d
-       movl    %r8d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,48(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-       movl    52(%rsi),%r12d
-       movl    %r11d,%r13d
-       movl    %edx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,52(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-       movl    56(%rsi),%r12d
-       movl    %r10d,%r13d
-       movl    %ecx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,56(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-       movl    60(%rsi),%r12d
-       movl    %r9d,%r13d
-       movl    %ebx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,60(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    4(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
+       movq    %rbx,%rdi
+       leaq    K512(%rip),%rbp
+       xorq    %rcx,%rdi
+       movq    0(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    8(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    16(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    24(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    32(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    40(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    48(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    56(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rax
+       movq    64(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    72(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    80(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    88(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    96(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    104(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    112(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    120(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
        jmp     .Lrounds_16_xx
 .align 16
 .Lrounds_16_xx:
-
-       movl    56(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    36(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    0(%rsp),%r12d
-       movl    %r8d,%r13d
-       addl    %r14d,%r12d
-       movl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,0(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       movl    8(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-
-       movl    60(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    40(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    4(%rsp),%r12d
-       movl    %edx,%r13d
-       addl    %r14d,%r12d
-       movl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,4(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       movl    12(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-
-       movl    0(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    44(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    8(%rsp),%r12d
-       movl    %ecx,%r13d
-       addl    %r14d,%r12d
-       movl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,8(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       movl    16(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-
-       movl    4(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    48(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    12(%rsp),%r12d
-       movl    %ebx,%r13d
-       addl    %r14d,%r12d
-       movl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,12(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       movl    20(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-
-       movl    8(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    52(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    16(%rsp),%r12d
-       movl    %eax,%r13d
-       addl    %r14d,%r12d
-       movl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,16(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       movl    24(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-
-       movl    12(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    56(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    20(%rsp),%r12d
-       movl    %r11d,%r13d
-       addl    %r14d,%r12d
-       movl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,20(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       movl    28(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-
-       movl    16(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    60(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    24(%rsp),%r12d
-       movl    %r10d,%r13d
-       addl    %r14d,%r12d
-       movl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,24(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       movl    32(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-
-       movl    20(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    0(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    28(%rsp),%r12d
-       movl    %r9d,%r13d
-       addl    %r14d,%r12d
-       movl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,28(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    36(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-
-       movl    24(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    4(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    32(%rsp),%r12d
-       movl    %r8d,%r13d
-       addl    %r14d,%r12d
-       movl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,32(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       movl    40(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-
-       movl    28(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    8(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    36(%rsp),%r12d
-       movl    %edx,%r13d
-       addl    %r14d,%r12d
-       movl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,36(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       movl    44(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-
-       movl    32(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    12(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    40(%rsp),%r12d
-       movl    %ecx,%r13d
-       addl    %r14d,%r12d
-       movl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,40(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       movl    48(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-
-       movl    36(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    16(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    44(%rsp),%r12d
-       movl    %ebx,%r13d
-       addl    %r14d,%r12d
-       movl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,44(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       movl    52(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-
-       movl    40(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    20(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    48(%rsp),%r12d
-       movl    %eax,%r13d
-       addl    %r14d,%r12d
-       movl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,48(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       movl    56(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-
-       movl    44(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    24(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    52(%rsp),%r12d
-       movl    %r11d,%r13d
-       addl    %r14d,%r12d
-       movl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,52(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       movl    60(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-
-       movl    48(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    28(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    56(%rsp),%r12d
-       movl    %r10d,%r13d
-       addl    %r14d,%r12d
-       movl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,56(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       movl    0(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-
-       movl    52(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    32(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    60(%rsp),%r12d
-       movl    %r9d,%r13d
-       addl    %r14d,%r12d
-       movl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,60(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    4(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-       cmpb    $0,3(%rbp)
+       movq    8(%rsp),%r13
+       movq    112(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    72(%rsp),%r12
+
+       addq    0(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       movq    16(%rsp),%r13
+       movq    120(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    80(%rsp),%r12
+
+       addq    8(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       movq    24(%rsp),%r13
+       movq    0(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    88(%rsp),%r12
+
+       addq    16(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       movq    32(%rsp),%r13
+       movq    8(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    96(%rsp),%r12
+
+       addq    24(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       movq    40(%rsp),%r13
+       movq    16(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    104(%rsp),%r12
+
+       addq    32(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       movq    48(%rsp),%r13
+       movq    24(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    112(%rsp),%r12
+
+       addq    40(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       movq    56(%rsp),%r13
+       movq    32(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    120(%rsp),%r12
+
+       addq    48(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       movq    64(%rsp),%r13
+       movq    40(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    0(%rsp),%r12
+
+       addq    56(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       movq    72(%rsp),%r13
+       movq    48(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    8(%rsp),%r12
+
+       addq    64(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       movq    80(%rsp),%r13
+       movq    56(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    16(%rsp),%r12
+
+       addq    72(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       movq    88(%rsp),%r13
+       movq    64(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    24(%rsp),%r12
+
+       addq    80(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       movq    96(%rsp),%r13
+       movq    72(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    32(%rsp),%r12
+
+       addq    88(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       movq    104(%rsp),%r13
+       movq    80(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    40(%rsp),%r12
+
+       addq    96(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       movq    112(%rsp),%r13
+       movq    88(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    48(%rsp),%r12
+
+       addq    104(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       movq    120(%rsp),%r13
+       movq    96(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    56(%rsp),%r12
+
+       addq    112(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       movq    0(%rsp),%r13
+       movq    104(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    64(%rsp),%r12
+
+       addq    120(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       cmpb    $0,7(%rbp)
        jnz     .Lrounds_16_xx
 
-       movq    64+0(%rsp),%rdi
-       leaq    64(%rsi),%rsi
-
-       addl    0(%rdi),%eax
-       addl    4(%rdi),%ebx
-       addl    8(%rdi),%ecx
-       addl    12(%rdi),%edx
-       addl    16(%rdi),%r8d
-       addl    20(%rdi),%r9d
-       addl    24(%rdi),%r10d
-       addl    28(%rdi),%r11d
-
-       cmpq    64+16(%rsp),%rsi
-
-       movl    %eax,0(%rdi)
-       movl    %ebx,4(%rdi)
-       movl    %ecx,8(%rdi)
-       movl    %edx,12(%rdi)
-       movl    %r8d,16(%rdi)
-       movl    %r9d,20(%rdi)
-       movl    %r10d,24(%rdi)
-       movl    %r11d,28(%rdi)
+       movq    128+0(%rsp),%rdi
+       addq    %r14,%rax
+       leaq    128(%rsi),%rsi
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
        jb      .Lloop
 
-       movq    64+24(%rsp),%rsi
-       movq    (%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
+       movq    152(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lepilogue:
        .byte   0xf3,0xc3
-.size  sha256_block_data_order,.-sha256_block_data_order
+.cfi_endproc   
+.size  sha512_block_data_order,.-sha512_block_data_order
 .align 64
-.type  K256,@object
-K256:
-.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.type  sha256_block_data_order_ssse3,@function
+.type  K512,@object
+K512:
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.byte  83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type  sha512_block_data_order_xop,@function
 .align 64
-sha256_block_data_order_ssse3:
-.Lssse3_shortcut:
+sha512_block_data_order_xop:
+.cfi_startproc 
+.Lxop_shortcut:
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
        pushq   %rbx
+.cfi_offset    %rbx,-16
        pushq   %rbp
+.cfi_offset    %rbp,-24
        pushq   %r12
+.cfi_offset    %r12,-32
        pushq   %r13
+.cfi_offset    %r13,-40
        pushq   %r14
+.cfi_offset    %r14,-48
        pushq   %r15
-       movq    %rsp,%r11
+.cfi_offset    %r15,-56
        shlq    $4,%rdx
-       subq    $96,%rsp
-       leaq    (%rsi,%rdx,4),%rdx
+       subq    $160,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
        andq    $-64,%rsp
-       movq    %rdi,64+0(%rsp)
-       movq    %rsi,64+8(%rsp)
-       movq    %rdx,64+16(%rsp)
-       movq    %r11,64+24(%rsp)
-.Lprologue_ssse3:
-
-       movl    0(%rdi),%eax
-       movl    4(%rdi),%ebx
-       movl    8(%rdi),%ecx
-       movl    12(%rdi),%edx
-       movl    16(%rdi),%r8d
-       movl    20(%rdi),%r9d
-       movl    24(%rdi),%r10d
-       movl    28(%rdi),%r11d
-       movdqa  K256+512+32(%rip),%xmm8
-       movdqa  K256+512+64(%rip),%xmm9
-       jmp     .Lloop_ssse3
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_xop:
+
+       vzeroupper
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Lloop_xop
 .align 16
-.Lloop_ssse3:
-       movdqa  K256+512(%rip),%xmm7
-       movdqu  0(%rsi),%xmm0
-       movdqu  16(%rsi),%xmm1
-       movdqu  32(%rsi),%xmm2
-       movdqu  48(%rsi),%xmm3
-.byte  102,15,56,0,199
-       leaq    K256(%rip),%rbp
-.byte  102,15,56,0,207
-       movdqa  0(%rbp),%xmm4
-.byte  102,15,56,0,215
-       movdqa  32(%rbp),%xmm5
-       paddd   %xmm0,%xmm4
-       movdqa  64(%rbp),%xmm6
-.byte  102,15,56,0,223
-       movdqa  96(%rbp),%xmm7
-       paddd   %xmm1,%xmm5
-       paddd   %xmm2,%xmm6
-       paddd   %xmm3,%xmm7
-       movdqa  %xmm4,0(%rsp)
-       movl    %eax,%r14d
-       movdqa  %xmm5,16(%rsp)
-       movl    %ebx,%edi
-       movdqa  %xmm6,32(%rsp)
-       xorl    %ecx,%edi
-       movdqa  %xmm7,48(%rsp)
-       movl    %r8d,%r13d
-       jmp     .Lssse3_00_47
+.Lloop_xop:
+       vmovdqa K512+1280(%rip),%xmm11
+       vmovdqu 0(%rsi),%xmm0
+       leaq    K512+128(%rip),%rbp
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9
+       vpshufb %xmm11,%xmm7,%xmm7
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11
+       vmovdqa %xmm8,0(%rsp)
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %xmm11,112(%rsp)
+       movq    %r8,%r13
+       jmp     .Lxop_00_47
 
 .align 16
-.Lssse3_00_47:
-       subq    $-32*4,%rbp
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movdqa  %xmm1,%xmm4
-       movl    %r9d,%r12d
-       movdqa  %xmm3,%xmm7
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-.byte  102,15,58,15,224,4
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-.byte  102,15,58,15,250,4
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    0(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %ebx,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       paddd   %xmm7,%xmm0
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       psrld   $7,%xmm6
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       pshufd  $250,%xmm3,%xmm7
-       movl    %r8d,%r12d
-       pslld   $14,%xmm5
-       xorl    %edx,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       pslld   $11,%xmm5
-       addl    4(%rsp),%r10d
-       pxor    %xmm6,%xmm4
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       paddd   %xmm4,%xmm0
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    8(%rsp),%r9d
-.byte  102,65,15,56,0,248
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm0
-       addl    %r12d,%r9d
-       pshufd  $80,%xmm0,%xmm7
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       psrld   $10,%xmm7
-       movl    %ebx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       pxor    %xmm6,%xmm7
-       addl    12(%rsp),%r8d
-       movl    %r9d,%edi
-       movdqa  0(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       paddd   %xmm7,%xmm0
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       paddd   %xmm0,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       movdqa  %xmm6,0(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movdqa  %xmm2,%xmm4
-       movl    %ebx,%r12d
-       movdqa  %xmm0,%xmm7
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-.byte  102,15,58,15,225,4
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-.byte  102,15,58,15,251,4
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    16(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %r9d,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       paddd   %xmm7,%xmm1
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       psrld   $7,%xmm6
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       pshufd  $250,%xmm0,%xmm7
-       movl    %eax,%r12d
-       pslld   $14,%xmm5
-       xorl    %r11d,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       pslld   $11,%xmm5
-       addl    20(%rsp),%ecx
-       pxor    %xmm6,%xmm4
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       paddd   %xmm4,%xmm1
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    24(%rsp),%ebx
-.byte  102,65,15,56,0,248
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm1
-       addl    %r12d,%ebx
-       pshufd  $80,%xmm1,%xmm7
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       psrld   $10,%xmm7
-       movl    %r9d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       pxor    %xmm6,%xmm7
-       addl    28(%rsp),%eax
-       movl    %ebx,%edi
-       movdqa  32(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       paddd   %xmm7,%xmm1
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       paddd   %xmm1,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movdqa  %xmm6,16(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movdqa  %xmm3,%xmm4
-       movl    %r9d,%r12d
-       movdqa  %xmm1,%xmm7
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-.byte  102,15,58,15,226,4
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-.byte  102,15,58,15,248,4
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    32(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %ebx,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       paddd   %xmm7,%xmm2
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       psrld   $7,%xmm6
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       pshufd  $250,%xmm1,%xmm7
-       movl    %r8d,%r12d
-       pslld   $14,%xmm5
-       xorl    %edx,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       pslld   $11,%xmm5
-       addl    36(%rsp),%r10d
-       pxor    %xmm6,%xmm4
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       paddd   %xmm4,%xmm2
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    40(%rsp),%r9d
-.byte  102,65,15,56,0,248
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm2
-       addl    %r12d,%r9d
-       pshufd  $80,%xmm2,%xmm7
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       psrld   $10,%xmm7
-       movl    %ebx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       pxor    %xmm6,%xmm7
-       addl    44(%rsp),%r8d
-       movl    %r9d,%edi
-       movdqa  64(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       paddd   %xmm7,%xmm2
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       paddd   %xmm2,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       movdqa  %xmm6,32(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movdqa  %xmm0,%xmm4
-       movl    %ebx,%r12d
-       movdqa  %xmm2,%xmm7
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-.byte  102,15,58,15,227,4
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-.byte  102,15,58,15,249,4
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    48(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %r9d,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       paddd   %xmm7,%xmm3
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       psrld   $7,%xmm6
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       pshufd  $250,%xmm2,%xmm7
-       movl    %eax,%r12d
-       pslld   $14,%xmm5
-       xorl    %r11d,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       pslld   $11,%xmm5
-       addl    52(%rsp),%ecx
-       pxor    %xmm6,%xmm4
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       paddd   %xmm4,%xmm3
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    56(%rsp),%ebx
-.byte  102,65,15,56,0,248
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm3
-       addl    %r12d,%ebx
-       pshufd  $80,%xmm3,%xmm7
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       psrld   $10,%xmm7
-       movl    %r9d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       pxor    %xmm6,%xmm7
-       addl    60(%rsp),%eax
-       movl    %ebx,%edi
-       movdqa  96(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       paddd   %xmm7,%xmm3
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       paddd   %xmm3,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movdqa  %xmm6,48(%rsp)
-       cmpb    $0,131(%rbp)
-       jne     .Lssse3_00_47
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movl    %r9d,%r12d
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    0(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       xorl    %ebx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       movl    %r8d,%r12d
-       xorl    %edx,%r13d
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       addl    4(%rsp),%r10d
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    8(%rsp),%r9d
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r9d
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       movl    %ebx,%r13d
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       addl    12(%rsp),%r8d
-       movl    %r9d,%edi
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movl    %ebx,%r12d
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    16(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       xorl    %r9d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       movl    %eax,%r12d
-       xorl    %r11d,%r13d
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       addl    20(%rsp),%ecx
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    24(%rsp),%ebx
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%ebx
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       movl    %r9d,%r13d
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       addl    28(%rsp),%eax
-       movl    %ebx,%edi
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movl    %r9d,%r12d
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    32(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       xorl    %ebx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       movl    %r8d,%r12d
-       xorl    %edx,%r13d
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       addl    36(%rsp),%r10d
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    40(%rsp),%r9d
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r9d
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       movl    %ebx,%r13d
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       addl    44(%rsp),%r8d
-       movl    %r9d,%edi
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movl    %ebx,%r12d
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    48(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       xorl    %r9d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       movl    %eax,%r12d
-       xorl    %r11d,%r13d
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       addl    52(%rsp),%ecx
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    56(%rsp),%ebx
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%ebx
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       movl    %r9d,%r13d
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       addl    60(%rsp),%eax
-       movl    %ebx,%edi
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movq    64+0(%rsp),%rdi
-       movl    %r14d,%eax
-
-       addl    0(%rdi),%eax
-       leaq    64(%rsi),%rsi
-       addl    4(%rdi),%ebx
-       addl    8(%rdi),%ecx
-       addl    12(%rdi),%edx
-       addl    16(%rdi),%r8d
-       addl    20(%rdi),%r9d
-       addl    24(%rdi),%r10d
-       addl    28(%rdi),%r11d
-
-       cmpq    64+16(%rsp),%rsi
-
-       movl    %eax,0(%rdi)
-       movl    %ebx,4(%rdi)
-       movl    %ecx,8(%rdi)
-       movl    %edx,12(%rdi)
-       movl    %r8d,16(%rdi)
-       movl    %r9d,20(%rdi)
-       movl    %r10d,24(%rdi)
-       movl    %r11d,28(%rdi)
-       jb      .Lloop_ssse3
-
-       movq    64+24(%rsp),%rsi
-       movq    (%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
-.Lepilogue_ssse3:
+.Lxop_00_47:
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       vpaddq  %xmm11,%xmm0,%xmm0
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,223,3
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm7,%xmm10
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %rdx,%r13
+       addq    %r11,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r11
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpaddq  %xmm11,%xmm0,%xmm0
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       vpaddq  %xmm11,%xmm1,%xmm1
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,216,3
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm0,%xmm10
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rbx,%r13
+       addq    %r9,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r9
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpaddq  %xmm11,%xmm1,%xmm1
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       vpaddq  %xmm11,%xmm2,%xmm2
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,217,3
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm1,%xmm10
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %r11,%r13
+       addq    %rdx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpaddq  %xmm11,%xmm2,%xmm2
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       vpaddq  %xmm11,%xmm3,%xmm3
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,218,3
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm2,%xmm10
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r9,%r13
+       addq    %rbx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpaddq  %xmm11,%xmm3,%xmm3
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       vpaddq  %xmm11,%xmm4,%xmm4
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,219,3
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm3,%xmm10
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %rdx,%r13
+       addq    %r11,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r11
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpaddq  %xmm11,%xmm4,%xmm4
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       vpaddq  %xmm11,%xmm5,%xmm5
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,220,3
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm4,%xmm10
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rbx,%r13
+       addq    %r9,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r9
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpaddq  %xmm11,%xmm5,%xmm5
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       vpaddq  %xmm11,%xmm6,%xmm6
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,221,3
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm5,%xmm10
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %r11,%r13
+       addq    %rdx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpaddq  %xmm11,%xmm6,%xmm6
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       vpaddq  %xmm11,%xmm7,%xmm7
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,222,3
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm6,%xmm10
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r9,%r13
+       addq    %rbx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpaddq  %xmm11,%xmm7,%xmm7
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     .Lxop_00_47
+       rorq    $23,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       rorq    $5,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       rorq    $23,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       rorq    $5,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       rorq    $23,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       rorq    $23,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       rorq    $5,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       rorq    $5,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       rorq    $5,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       rorq    $23,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       rorq    $5,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       rorq    $23,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       rorq    $23,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       rorq    $5,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       rorq    $5,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      .Lloop_xop
+
+       movq    152(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_xop:
        .byte   0xf3,0xc3
-.size  sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
-
+.cfi_endproc   
+.size  sha512_block_data_order_xop,.-sha512_block_data_order_xop
+.type  sha512_block_data_order_avx,@function
+.align 64
+sha512_block_data_order_avx:
+.cfi_startproc 
+.Lavx_shortcut:
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $160,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+       vzeroupper
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Lloop_avx
+.align 16
+.Lloop_avx:
+       vmovdqa K512+1280(%rip),%xmm11
+       vmovdqu 0(%rsi),%xmm0
+       leaq    K512+128(%rip),%rbp
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9
+       vpshufb %xmm11,%xmm7,%xmm7
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11
+       vmovdqa %xmm8,0(%rsp)
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %xmm11,112(%rsp)
+       movq    %r8,%r13
+       jmp     .Lavx_00_47
 
-.section .note.GNU-stack,"",%progbits
+.align 16
+.Lavx_00_47:
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm0,%xmm0
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm7,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm7,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm7,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm0,%xmm0
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm1,%xmm1
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm0,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm0,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm0,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm1,%xmm1
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm2,%xmm2
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm1,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm1,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm1,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm2,%xmm2
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm3,%xmm3
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm2,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm2,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm2,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm3,%xmm3
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm4,%xmm4
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm3,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm3,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm3,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm4,%xmm4
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm5,%xmm5
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm4,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm4,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm4,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm5,%xmm5
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm6,%xmm6
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm5,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm5,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm5,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm6,%xmm6
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm7,%xmm7
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm6,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm6,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm6,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm7,%xmm7
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     .Lavx_00_47
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      .Lloop_avx
+
+       movq    152(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  sha512_block_data_order_avx,.-sha512_block_data_order_avx
+.type  sha512_block_data_order_avx2,@function
+.align 64
+sha512_block_data_order_avx2:
+.cfi_startproc 
+.Lavx2_shortcut:
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       subq    $1312,%rsp
+       shlq    $4,%rdx
+       andq    $-2048,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       addq    $1152,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx2:
+
+       vzeroupper
+       subq    $-128,%rsi
+       movq    0(%rdi),%rax
+       movq    %rsi,%r12
+       movq    8(%rdi),%rbx
+       cmpq    %rdx,%rsi
+       movq    16(%rdi),%rcx
+       cmoveq  %rsp,%r12
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Loop_avx2
+.align 16
+.Loop_avx2:
+       vmovdqu -128(%rsi),%xmm0
+       vmovdqu -128+16(%rsi),%xmm1
+       vmovdqu -128+32(%rsi),%xmm2
+       leaq    K512+128(%rip),%rbp
+       vmovdqu -128+48(%rsi),%xmm3
+       vmovdqu -128+64(%rsi),%xmm4
+       vmovdqu -128+80(%rsi),%xmm5
+       vmovdqu -128+96(%rsi),%xmm6
+       vmovdqu -128+112(%rsi),%xmm7
+
+       vmovdqa 1152(%rbp),%ymm10
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm10,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm10,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+       vpshufb %ymm10,%ymm2,%ymm2
+       vinserti128     $1,64(%r12),%ymm4,%ymm4
+       vpshufb %ymm10,%ymm3,%ymm3
+       vinserti128     $1,80(%r12),%ymm5,%ymm5
+       vpshufb %ymm10,%ymm4,%ymm4
+       vinserti128     $1,96(%r12),%ymm6,%ymm6
+       vpshufb %ymm10,%ymm5,%ymm5
+       vinserti128     $1,112(%r12),%ymm7,%ymm7
+
+       vpaddq  -128(%rbp),%ymm0,%ymm8
+       vpshufb %ymm10,%ymm6,%ymm6
+       vpaddq  -96(%rbp),%ymm1,%ymm9
+       vpshufb %ymm10,%ymm7,%ymm7
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       vpaddq  -32(%rbp),%ymm3,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       vpaddq  0(%rbp),%ymm4,%ymm8
+       vmovdqa %ymm9,32(%rsp)
+       vpaddq  32(%rbp),%ymm5,%ymm9
+       vmovdqa %ymm10,64(%rsp)
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       vmovdqa %ymm11,96(%rsp)
+       leaq    -128(%rsp),%rsp
+       vpaddq  96(%rbp),%ymm7,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       xorq    %r14,%r14
+       vmovdqa %ymm9,32(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %ymm10,64(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %ymm11,96(%rsp)
+       movq    %r9,%r12
+       addq    $32*8,%rbp
+       jmp     .Lavx2_00_47
 
+.align 16
+.Lavx2_00_47:
+       leaq    -128(%rsp),%rsp
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm4,%ymm5,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm0,%ymm0
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm7,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm7,%ymm10
+       vpaddq  %ymm8,%ymm0,%ymm0
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm7,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm0,%ymm0
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  -128(%rbp),%ymm0,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm5,%ymm6,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm1,%ymm1
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm0,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm0,%ymm10
+       vpaddq  %ymm8,%ymm1,%ymm1
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm0,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm1,%ymm1
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  -96(%rbp),%ymm1,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm6,%ymm7,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm2,%ymm2
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm1,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm1,%ymm10
+       vpaddq  %ymm8,%ymm2,%ymm2
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm1,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm2,%ymm2
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm7,%ymm0,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm3,%ymm3
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm2,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm2,%ymm10
+       vpaddq  %ymm8,%ymm3,%ymm3
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm2,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm3,%ymm3
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  -32(%rbp),%ymm3,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    -128(%rsp),%rsp
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm0,%ymm1,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm4,%ymm4
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm3,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm3,%ymm10
+       vpaddq  %ymm8,%ymm4,%ymm4
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm3,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm4,%ymm4
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  0(%rbp),%ymm4,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm1,%ymm2,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm5,%ymm5
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm4,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm4,%ymm10
+       vpaddq  %ymm8,%ymm5,%ymm5
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm4,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm5,%ymm5
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  32(%rbp),%ymm5,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm2,%ymm3,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm6,%ymm6
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm5,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm5,%ymm10
+       vpaddq  %ymm8,%ymm6,%ymm6
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm5,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm6,%ymm6
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm3,%ymm4,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm7,%ymm7
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm6,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm6,%ymm10
+       vpaddq  %ymm8,%ymm7,%ymm7
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm6,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm7,%ymm7
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  96(%rbp),%ymm7,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    256(%rbp),%rbp
+       cmpb    $0,-121(%rbp)
+       jne     .Lavx2_00_47
+       addq    0+128(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+128(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+128(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+128(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+128(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+128(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+128(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+128(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       addq    0(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rbp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       cmpq    144(%rbp),%rsi
+       je      .Ldone_avx2
+
+       xorq    %r14,%r14
+       movq    %rbx,%rdi
+       xorq    %rcx,%rdi
+       movq    %r9,%r12
+       jmp     .Lower_avx2
+.align 16
+.Lower_avx2:
+       addq    0+16(%rbp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+16(%rbp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+16(%rbp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+16(%rbp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+16(%rbp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+16(%rbp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+16(%rbp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+16(%rbp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       leaq    -128(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rsp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       leaq    256(%rsi),%rsi
+       addq    48(%rdi),%r10
+       movq    %rsi,%r12
+       addq    56(%rdi),%r11
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+.Ldone_avx2:
+       leaq    (%rbp),%rsp
+       movq    152(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx2:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  sha512_block_data_order_avx2,.-sha512_block_data_order_avx2
 
+.section .note.GNU-stack,"",%progbits
index a134213922bd2a0584f3571b1876f87b4addd17b..cf688b3883f0136cde173fc656b67a56e6273214 100644 (file)
@@ -1,9 +1,9 @@
 X86_FILES_ELF=elf/aesni-x86.s elf/cpuid-x86.s elf/sha1-ssse3-x86.s elf/sha256-ssse3-x86.s elf/sha512-ssse3-x86.s elf/aes-ssse3-x86.s
 X86_FILES_COFF=coff/aesni-x86.s coff/cpuid-x86.s coff/sha1-ssse3-x86.s coff/sha256-ssse3-x86.s coff/sha512-ssse3-x86.s coff/aes-ssse3-x86.s
 X86_FILES_MACOSX=macosx/aesni-x86.s macosx/cpuid-x86.s macosx/sha1-ssse3-x86.s macosx/sha256-ssse3-x86.s macosx/sha512-ssse3-x86.s macosx/aes-ssse3-x86.s
-X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s elf/aesni-gcm-x86_64.s
-X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s coff/aesni-gcm-x86_64.s
-X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s macosx/aesni-gcm-x86_64.s
+X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s elf/aesni-gcm-x86_64.s elf/sha256-ssse3-x86_64.s
+X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s coff/aesni-gcm-x86_64.s coff/sha256-ssse3-x86_64.s
+X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s macosx/aesni-gcm-x86_64.s macosx/sha256-ssse3-x86_64.s
 X86_PADLOCK_FILES_ELF=elf/e_padlock-x86.s
 X86_PADLOCK_FILES_COFF=coff/e_padlock-x86.s
 X86_PADLOCK_FILES_MACOSX=macosx/e_padlock-x86.s
index c881a3d8880cf24991972ca866b4d572ca6a56a8..4be899281bbe7519fb60ae385cc83ada967dd7e0 100644 (file)
@@ -5,12 +5,11 @@
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
-## For details see https://shiftleft.org/papers/vector_aes/ and
-## https://crypto.stanford.edu/vpaes/.
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
 #
 # *** This file is auto-generated ***
 #
-.file  "vpaes-x86.s"
 .text
 .align 6,0x90
 L_vpaes_consts:
index 414bb483e0c1ebda4ce2e510b7247d81841c4099..3d5c652266ec65a3076c6fc61844d907708a3fad 100644 (file)
@@ -5,8 +5,8 @@
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
-## For details see https://shiftleft.org/papers/vector_aes/ and
-## https://crypto.stanford.edu/vpaes/.
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
 #
 # *** This file is auto-generated ***
 #
@@ -30,6 +30,7 @@
 
 .p2align       4
 _vpaes_encrypt_core:
+
        movq    %rdx,%r9
        movq    $16,%r11
        movl    240(%rdx),%eax
@@ -118,8 +119,10 @@ L$enc_entry:
 
 
 
+
 .p2align       4
 _vpaes_decrypt_core:
+
        movq    %rdx,%r9
        movl    240(%rdx),%eax
        movdqa  %xmm9,%xmm1
@@ -224,6 +227,7 @@ L$dec_entry:
 
 
 
+
 .p2align       4
 _vpaes_schedule_core:
 
@@ -231,6 +235,7 @@ _vpaes_schedule_core:
 
 
 
+
        call    _vpaes_preheat
        movdqa  L$k_rcon(%rip),%xmm8
        movdqu  (%rdi),%xmm0
@@ -409,8 +414,10 @@ L$schedule_mangle_last_dec:
 
 
 
+
 .p2align       4
 _vpaes_schedule_192_smear:
+
        pshufd  $0x80,%xmm6,%xmm1
        pshufd  $0xFE,%xmm7,%xmm0
        pxor    %xmm1,%xmm6
@@ -438,11 +445,13 @@ _vpaes_schedule_192_smear:
 
 
 
+
 
 
 .p2align       4
 _vpaes_schedule_round:
 
+
        pxor    %xmm1,%xmm1
 .byte  102,65,15,58,15,200,15
 .byte  102,69,15,58,15,192,15
@@ -507,8 +516,10 @@ _vpaes_schedule_low_round:
 
 
 
+
 .p2align       4
 _vpaes_schedule_transform:
+
        movdqa  %xmm9,%xmm1
        pandn   %xmm0,%xmm1
        psrld   $4,%xmm1
@@ -543,10 +554,12 @@ _vpaes_schedule_transform:
 
 
 
+
 
 
 .p2align       4
 _vpaes_schedule_mangle:
+
        movdqa  %xmm0,%xmm4
        movdqa  L$k_mc_forward(%rip),%xmm5
        testq   %rcx,%rcx
@@ -616,10 +629,12 @@ L$schedule_mangle_both:
 
 
 
+
 .globl _vpaes_set_encrypt_key
 
 .p2align       4
 _vpaes_set_encrypt_key:
+
        movl    %esi,%eax
        shrl    $5,%eax
        addl    $5,%eax
@@ -632,10 +647,12 @@ _vpaes_set_encrypt_key:
        .byte   0xf3,0xc3
 
 
+
 .globl _vpaes_set_decrypt_key
 
 .p2align       4
 _vpaes_set_decrypt_key:
+
        movl    %esi,%eax
        shrl    $5,%eax
        addl    $5,%eax
@@ -653,10 +670,12 @@ _vpaes_set_decrypt_key:
        .byte   0xf3,0xc3
 
 
+
 .globl _vpaes_encrypt
 
 .p2align       4
 _vpaes_encrypt:
+
        movdqu  (%rdi),%xmm0
        call    _vpaes_preheat
        call    _vpaes_encrypt_core
@@ -664,20 +683,24 @@ _vpaes_encrypt:
        .byte   0xf3,0xc3
 
 
+
 .globl _vpaes_decrypt
 
 .p2align       4
 _vpaes_decrypt:
+
        movdqu  (%rdi),%xmm0
        call    _vpaes_preheat
        call    _vpaes_decrypt_core
        movdqu  %xmm0,(%rsi)
        .byte   0xf3,0xc3
 
+
 .globl _vpaes_cbc_encrypt
 
 .p2align       4
 _vpaes_cbc_encrypt:
+
        xchgq   %rcx,%rdx
        subq    $16,%rcx
        jc      L$cbc_abort
@@ -721,8 +744,10 @@ L$cbc_abort:
 
 
 
+
 .p2align       4
 _vpaes_preheat:
+
        leaq    L$k_s0F(%rip),%r10
        movdqa  -32(%r10),%xmm10
        movdqa  -16(%r10),%xmm11
@@ -739,6 +764,7 @@ _vpaes_preheat:
 
 
 
+
 .p2align       6
 _vpaes_consts:
 L$k_inv:
index 002041cee2c464edea9f97cf73496a40e2f66347..d540930b5b2c3f01c0963e7c6ebe034be68a4e86 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -354,17 +354,25 @@ L$6x_done:
 
 .p2align       5
 _aesni_gcm_decrypt:
+
        xorq    %r10,%r10
        cmpq    $0x60,%rdx
        jb      L$gcm_dec_abort
 
        leaq    (%rsp),%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
+
        vzeroupper
 
        vmovdqu (%r8),%xmm1
@@ -426,17 +434,25 @@ L$dec_no_key_aliasing:
 
        vzeroupper
        movq    -48(%rax),%r15
+
        movq    -40(%rax),%r14
+
        movq    -32(%rax),%r13
+
        movq    -24(%rax),%r12
+
        movq    -16(%rax),%rbp
+
        movq    -8(%rax),%rbx
+
        leaq    (%rax),%rsp
+
 L$gcm_dec_abort:
        movq    %r10,%rax
        .byte   0xf3,0xc3
 
 
+
 .p2align       5
 _aesni_ctr32_6x:
        vmovdqu 0-128(%rcx),%xmm4
@@ -531,17 +547,25 @@ L$handle_ctr32_2:
 
 .p2align       5
 _aesni_gcm_encrypt:
+
        xorq    %r10,%r10
        cmpq    $288,%rdx
        jb      L$gcm_enc_abort
 
        leaq    (%rsp),%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
+
        vzeroupper
 
        vmovdqu (%r8),%xmm1
@@ -767,16 +791,24 @@ L$enc_no_key_aliasing:
 
        vzeroupper
        movq    -48(%rax),%r15
+
        movq    -40(%rax),%r14
+
        movq    -32(%rax),%r13
+
        movq    -24(%rax),%r12
+
        movq    -16(%rax),%rbp
+
        movq    -8(%rax),%rbx
+
        leaq    (%rax),%rsp
+
 L$gcm_enc_abort:
        movq    %r10,%rax
        .byte   0xf3,0xc3
 
+
 .p2align       6
 L$bswap_mask:
 .byte  15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
index 09ca1cbc5cf3cda9fb3758af4cc2a4099859a832..ee5008914625080460ec2f1c23a8a306ec733bbe 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "devel/perlasm/aesni-x86.s"
 .text
 .globl _aesni_encrypt
 .align 4
@@ -59,7 +58,10 @@ L000enc1_loop_1:
        leal    16(%edx),%edx
        jnz     L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .globl _aesni_decrypt
 .align 4
@@ -81,30 +83,84 @@ L001dec1_loop_2:
        leal    16(%edx),%edx
        jnz     L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
+       ret
+.align 4
+__aesni_encrypt2:
+       movups  (%edx),%xmm0
+       shll    $4,%ecx
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+L002enc2_loop:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L002enc2_loop
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,221,208
+.byte  102,15,56,221,216
+       ret
+.align 4
+__aesni_decrypt2:
+       movups  (%edx),%xmm0
+       shll    $4,%ecx
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+L003dec2_loop:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L003dec2_loop
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,223,208
+.byte  102,15,56,223,216
        ret
 .align 4
 __aesni_encrypt3:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
-       movups  (%edx),%xmm0
-L002enc3_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+L004enc3_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
-       movups  (%edx),%xmm0
-       jnz     L002enc3_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L004enc3_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -115,25 +171,26 @@ L002enc3_loop:
 .align 4
 __aesni_decrypt3:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
-       movups  (%edx),%xmm0
-L003dec3_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+       addl    $16,%ecx
+L005dec3_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
-       movups  (%edx),%xmm0
-       jnz     L003dec3_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L005dec3_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -145,27 +202,29 @@ L003dec3_loop:
 __aesni_encrypt4:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       shrl    $1,%ecx
-       leal    32(%edx),%edx
+       shll    $4,%ecx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
        pxor    %xmm0,%xmm5
-       movups  (%edx),%xmm0
-L004enc4_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  15,31,64,0
+       addl    $16,%ecx
+L006enc4_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
-       movups  (%edx),%xmm0
-       jnz     L004enc4_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L006enc4_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -179,27 +238,29 @@ L004enc4_loop:
 __aesni_decrypt4:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       shrl    $1,%ecx
-       leal    32(%edx),%edx
+       shll    $4,%ecx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
        pxor    %xmm0,%xmm4
        pxor    %xmm0,%xmm5
-       movups  (%edx),%xmm0
-L005dec4_loop:
+       movups  32(%edx),%xmm0
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  15,31,64,0
+       addl    $16,%ecx
+L007dec4_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
-       movups  (%edx),%xmm0
-       jnz     L005dec4_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L007dec4_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -212,45 +273,42 @@ L005dec4_loop:
 .align 4
 __aesni_encrypt6:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm5
-       decl    %ecx
-.byte  102,15,56,220,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,217
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,241
-       movups  (%edx),%xmm0
-.byte  102,15,56,220,249
-       jmp     L_aesni_encrypt6_enter
+       movups  (%edx,%ecx,1),%xmm0
+       addl    $16,%ecx
+       jmp     L008_aesni_encrypt6_inner
 .align 4,0x90
-L006enc6_loop:
+L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %ecx
 .byte  102,15,56,220,225
+L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.align 4,0x90
 L_aesni_encrypt6_enter:
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leal    32(%edx),%edx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  (%edx),%xmm0
-       jnz     L006enc6_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -267,45 +325,42 @@ L_aesni_encrypt6_enter:
 .align 4
 __aesni_decrypt6:
        movups  (%edx),%xmm0
-       shrl    $1,%ecx
+       shll    $4,%ecx
        movups  16(%edx),%xmm1
-       leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,222,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,222,217
+.byte  102,15,56,222,209
        pxor    %xmm0,%xmm5
-       decl    %ecx
-.byte  102,15,56,222,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,217
+       leal    32(%edx,%ecx,1),%edx
+       negl    %ecx
+.byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,241
-       movups  (%edx),%xmm0
-.byte  102,15,56,222,249
-       jmp     L_aesni_decrypt6_enter
+       movups  (%edx,%ecx,1),%xmm0
+       addl    $16,%ecx
+       jmp     L010_aesni_decrypt6_inner
 .align 4,0x90
-L007dec6_loop:
+L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %ecx
 .byte  102,15,56,222,225
+L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.align 4,0x90
 L_aesni_decrypt6_enter:
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leal    32(%edx),%edx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  (%edx),%xmm0
-       jnz     L007dec6_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -333,14 +388,14 @@ L_aesni_ecb_encrypt_begin:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      L008ecb_ret
+       jz      L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      L009ecb_decrypt
+       jz      L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      L010ecb_enc_tail
+       jb      L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -349,9 +404,9 @@ L_aesni_ecb_encrypt_begin:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     L011ecb_enc_loop6_enter
+       jmp     L015ecb_enc_loop6_enter
 .align 4,0x90
-L012ecb_enc_loop6:
+L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -366,12 +421,12 @@ L012ecb_enc_loop6:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-L011ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
        call    __aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     L012ecb_enc_loop6
+       jnc     L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -380,18 +435,18 @@ L011ecb_enc_loop6_enter:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      L008ecb_ret
-L010ecb_enc_tail:
+       jz      L012ecb_ret
+L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      L013ecb_enc_one
+       jb      L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      L014ecb_enc_two
+       je      L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      L015ecb_enc_three
+       jb      L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      L016ecb_enc_four
+       je      L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_encrypt6
@@ -400,50 +455,49 @@ L010ecb_enc_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L013ecb_enc_one:
+L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L017enc1_loop_3:
+L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L017enc1_loop_3
+       jnz     L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L014ecb_enc_two:
-       xorps   %xmm4,%xmm4
-       call    __aesni_encrypt3
+L018ecb_enc_two:
+       call    __aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L015ecb_enc_three:
+L019ecb_enc_three:
        call    __aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L016ecb_enc_four:
+L020ecb_enc_four:
        call    __aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L009ecb_decrypt:
+L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      L018ecb_dec_tail
+       jb      L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -452,9 +506,9 @@ L009ecb_decrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     L019ecb_dec_loop6_enter
+       jmp     L023ecb_dec_loop6_enter
 .align 4,0x90
-L020ecb_dec_loop6:
+L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -469,12 +523,12 @@ L020ecb_dec_loop6:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-L019ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
        call    __aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     L020ecb_dec_loop6
+       jnc     L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -483,18 +537,18 @@ L019ecb_dec_loop6_enter:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      L008ecb_ret
-L018ecb_dec_tail:
+       jz      L012ecb_ret
+L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      L021ecb_dec_one
+       jb      L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      L022ecb_dec_two
+       je      L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      L023ecb_dec_three
+       jb      L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      L024ecb_dec_four
+       je      L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_decrypt6
@@ -503,44 +557,51 @@ L018ecb_dec_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L021ecb_dec_one:
+L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L025dec1_loop_4:
+L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L025dec1_loop_4
+       jnz     L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L022ecb_dec_two:
-       xorps   %xmm4,%xmm4
-       call    __aesni_decrypt3
+L026ecb_dec_two:
+       call    __aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L023ecb_dec_three:
+L027ecb_dec_three:
        call    __aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L008ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L024ecb_dec_four:
+L028ecb_dec_four:
        call    __aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-L008ecb_ret:
+L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -577,48 +638,56 @@ L_aesni_ccm64_encrypt_blocks_begin:
        movl    %ebp,20(%esp)
        movl    %ebp,24(%esp)
        movl    %ebp,28(%esp)
-       shrl    $1,%ecx
+       shll    $4,%ecx
+       movl    $16,%ebx
        leal    (%edx),%ebp
        movdqa  (%esp),%xmm5
        movdqa  %xmm7,%xmm2
-       movl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       subl    %ecx,%ebx
 .byte  102,15,56,0,253
-L026ccm64_enc_outer:
+L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
        xorps   %xmm0,%xmm2
        movups  16(%ebp),%xmm1
        xorps   %xmm6,%xmm0
-       leal    32(%ebp),%edx
        xorps   %xmm0,%xmm3
-       movups  (%edx),%xmm0
-L027ccm64_enc2_loop:
+       movups  32(%ebp),%xmm0
+L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
-       decl    %ecx
 .byte  102,15,56,220,217
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
-       leal    32(%edx),%edx
 .byte  102,15,56,220,216
-       movups  (%edx),%xmm0
-       jnz     L027ccm64_enc2_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
+       decl    %eax
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
-       decl    %eax
        leal    16(%esi),%esi
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
-       leal    16(%edi),%edi
 .byte  102,15,56,0,213
-       jnz     L026ccm64_enc_outer
+       leal    16(%edi),%edi
+       jnz     L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -664,71 +733,82 @@ L_aesni_ccm64_decrypt_blocks_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L028enc1_loop_5:
+L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L028enc1_loop_5
+       jnz     L032enc1_loop_5
 .byte  102,15,56,221,209
+       shll    $4,%ebx
+       movl    $16,%ecx
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
        leal    16(%esi),%esi
-       jmp     L029ccm64_dec_outer
+       subl    %ebx,%ecx
+       leal    32(%ebp,%ebx,1),%edx
+       movl    %ecx,%ebx
+       jmp     L033ccm64_dec_outer
 .align 4,0x90
-L029ccm64_dec_outer:
+L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
-       movl    %ebx,%ecx
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      L030ccm64_dec_break
+       jz      L034ccm64_dec_break
        movups  (%ebp),%xmm0
-       shrl    $1,%ecx
+       movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
        xorps   %xmm0,%xmm6
-       leal    32(%ebp),%edx
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
-       movups  (%edx),%xmm0
-L031ccm64_dec2_loop:
+       movups  32(%ebp),%xmm0
+L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
-       decl    %ecx
 .byte  102,15,56,220,217
-       movups  16(%edx),%xmm1
+       movups  (%edx,%ecx,1),%xmm1
+       addl    $32,%ecx
 .byte  102,15,56,220,208
-       leal    32(%edx),%edx
 .byte  102,15,56,220,216
-       movups  (%edx),%xmm0
-       jnz     L031ccm64_dec2_loop
+       movups  -16(%edx,%ecx,1),%xmm0
+       jnz     L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       leal    16(%esi),%esi
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
-       jmp     L029ccm64_dec_outer
+       leal    16(%esi),%esi
+       jmp     L033ccm64_dec_outer
 .align 4,0x90
-L030ccm64_dec_break:
+L034ccm64_dec_break:
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-L032enc1_loop_6:
+L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L032enc1_loop_6
+       jnz     L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -752,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      L033ctr32_one_shortcut
+       je      L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -768,63 +848,59 @@ L_aesni_ctr32_encrypt_blocks_begin:
 .byte  102,15,58,34,253,3
        movl    240(%edx),%ecx
        bswap   %ebx
-       pxor    %xmm1,%xmm1
        pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movdqa  (%esp),%xmm2
-.byte  102,15,58,34,203,0
+.byte  102,15,58,34,195,0
        leal    3(%ebx),%ebp
-.byte  102,15,58,34,197,0
+.byte  102,15,58,34,205,0
        incl    %ebx
-.byte  102,15,58,34,203,1
+.byte  102,15,58,34,195,1
        incl    %ebp
-.byte  102,15,58,34,197,1
+.byte  102,15,58,34,205,1
        incl    %ebx
-.byte  102,15,58,34,203,2
+.byte  102,15,58,34,195,2
        incl    %ebp
-.byte  102,15,58,34,197,2
-       movdqa  %xmm1,48(%esp)
-.byte  102,15,56,0,202
-       movdqa  %xmm0,64(%esp)
+.byte  102,15,58,34,205,2
+       movdqa  %xmm0,48(%esp)
 .byte  102,15,56,0,194
-       pshufd  $192,%xmm1,%xmm2
-       pshufd  $128,%xmm1,%xmm3
+       movdqu  (%edx),%xmm6
+       movdqa  %xmm1,64(%esp)
+.byte  102,15,56,0,202
+       pshufd  $192,%xmm0,%xmm2
+       pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      L034ctr32_tail
+       jb      L038ctr32_tail
+       pxor    %xmm6,%xmm7
+       shll    $4,%ecx
+       movl    $16,%ebx
        movdqa  %xmm7,32(%esp)
-       shrl    $1,%ecx
        movl    %edx,%ebp
-       movl    %ecx,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     L035ctr32_loop6
+       jmp     L039ctr32_loop6
 .align 4,0x90
-L035ctr32_loop6:
-       pshufd  $64,%xmm1,%xmm4
-       movdqa  32(%esp),%xmm1
-       pshufd  $192,%xmm0,%xmm5
-       por     %xmm1,%xmm2
-       pshufd  $128,%xmm0,%xmm6
-       por     %xmm1,%xmm3
-       pshufd  $64,%xmm0,%xmm7
-       por     %xmm1,%xmm4
-       por     %xmm1,%xmm5
-       por     %xmm1,%xmm6
-       por     %xmm1,%xmm7
-       movups  (%ebp),%xmm0
-       movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
-       decl    %ecx
+L039ctr32_loop6:
+       pshufd  $64,%xmm0,%xmm4
+       movdqa  32(%esp),%xmm0
+       pshufd  $192,%xmm1,%xmm5
        pxor    %xmm0,%xmm2
+       pshufd  $128,%xmm1,%xmm6
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
+       pshufd  $64,%xmm1,%xmm7
+       movups  16(%ebp),%xmm1
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,220,225
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
        pxor    %xmm0,%xmm7
+.byte  102,15,56,220,217
+       movups  32(%ebp),%xmm0
+       movl    %ebx,%ecx
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
 .byte  102,15,56,220,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,220,249
        call    L_aesni_encrypt6_enter
        movups  (%esi),%xmm1
@@ -835,51 +911,51 @@ L035ctr32_loop6:
        movups  %xmm2,(%edi)
        movdqa  16(%esp),%xmm0
        xorps   %xmm1,%xmm4
-       movdqa  48(%esp),%xmm1
+       movdqa  64(%esp),%xmm1
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        paddd   %xmm0,%xmm1
-       paddd   64(%esp),%xmm0
+       paddd   48(%esp),%xmm0
        movdqa  (%esp),%xmm2
        movups  48(%esi),%xmm3
        movups  64(%esi),%xmm4
        xorps   %xmm3,%xmm5
        movups  80(%esi),%xmm3
        leal    96(%esi),%esi
-       movdqa  %xmm1,48(%esp)
-.byte  102,15,56,0,202
+       movdqa  %xmm0,48(%esp)
+.byte  102,15,56,0,194
        xorps   %xmm4,%xmm6
        movups  %xmm5,48(%edi)
        xorps   %xmm3,%xmm7
-       movdqa  %xmm0,64(%esp)
-.byte  102,15,56,0,194
+       movdqa  %xmm1,64(%esp)
+.byte  102,15,56,0,202
        movups  %xmm6,64(%edi)
-       pshufd  $192,%xmm1,%xmm2
+       pshufd  $192,%xmm0,%xmm2
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
-       movl    %ebx,%ecx
-       pshufd  $128,%xmm1,%xmm3
+       pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     L035ctr32_loop6
+       jnc     L039ctr32_loop6
        addl    $6,%eax
-       jz      L036ctr32_ret
+       jz      L040ctr32_ret
+       movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
-       leal    1(,%ecx,2),%ecx
-       movdqa  32(%esp),%xmm7
-L034ctr32_tail:
+       pxor    32(%esp),%xmm7
+       movl    240(%ebp),%ecx
+L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      L037ctr32_one
-       pshufd  $64,%xmm1,%xmm4
+       jb      L041ctr32_one
+       pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      L038ctr32_two
-       pshufd  $192,%xmm0,%xmm5
+       je      L042ctr32_two
+       pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      L039ctr32_three
-       pshufd  $128,%xmm0,%xmm6
+       jb      L043ctr32_three
+       pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      L040ctr32_four
+       je      L044ctr32_four
        por     %xmm7,%xmm6
        call    __aesni_encrypt6
        movups  (%esi),%xmm1
@@ -897,39 +973,39 @@ L034ctr32_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L036ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L033ctr32_one_shortcut:
+L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-L037ctr32_one:
+L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L041enc1_loop_7:
+L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L041enc1_loop_7
+       jnz     L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     L036ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L038ctr32_two:
-       call    __aesni_encrypt3
+L042ctr32_two:
+       call    __aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L036ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L039ctr32_three:
+L043ctr32_three:
        call    __aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -940,9 +1016,9 @@ L039ctr32_three:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L036ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L040ctr32_four:
+L044ctr32_four:
        call    __aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -956,7 +1032,18 @@ L040ctr32_four:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-L036ctr32_ret:
+L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -979,12 +1066,12 @@ L_aesni_xts_encrypt_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L042enc1_loop_8:
+L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L042enc1_loop_8
+       jnz     L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1008,12 +1095,14 @@ L042enc1_loop_8:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      L043xts_enc_short
-       shrl    $1,%ecx
-       movl    %ecx,%ebx
-       jmp     L044xts_enc_loop6
+       jc      L047xts_enc_short
+       shll    $4,%ecx
+       movl    $16,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       jmp     L048xts_enc_loop6
 .align 4,0x90
-L044xts_enc_loop6:
+L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1049,6 +1138,7 @@ L044xts_enc_loop6:
        pand    %xmm3,%xmm7
        movups  (%esi),%xmm2
        pxor    %xmm1,%xmm7
+       movl    %ebx,%ecx
        movdqu  16(%esi),%xmm3
        xorps   %xmm0,%xmm2
        movdqu  32(%esi),%xmm4
@@ -1064,19 +1154,17 @@ L044xts_enc_loop6:
        movdqa  %xmm7,80(%esp)
        pxor    %xmm1,%xmm7
        movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
        pxor    16(%esp),%xmm3
-.byte  102,15,56,220,209
        pxor    32(%esp),%xmm4
-.byte  102,15,56,220,217
+.byte  102,15,56,220,209
        pxor    48(%esp),%xmm5
-       decl    %ecx
-.byte  102,15,56,220,225
        pxor    64(%esp),%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm7
+       movups  32(%ebp),%xmm0
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
 .byte  102,15,56,220,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,220,249
        call    L_aesni_encrypt6_enter
        movdqa  80(%esp),%xmm1
@@ -1101,26 +1189,25 @@ L044xts_enc_loop6:
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
-       movl    %ebx,%ecx
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     L044xts_enc_loop6
-       leal    1(,%ecx,2),%ecx
+       jnc     L048xts_enc_loop6
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-L043xts_enc_short:
+L047xts_enc_short:
        addl    $96,%eax
-       jz      L045xts_enc_done6x
+       jz      L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      L046xts_enc_one
+       jb      L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      L047xts_enc_two
+       je      L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1129,7 +1216,7 @@ L043xts_enc_short:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      L048xts_enc_three
+       jb      L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1139,7 +1226,7 @@ L043xts_enc_short:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      L049xts_enc_four
+       je      L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1171,9 +1258,9 @@ L043xts_enc_short:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     L050xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L046xts_enc_one:
+L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1181,37 +1268,36 @@ L046xts_enc_one:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L051enc1_loop_9:
+L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L051enc1_loop_9
+       jnz     L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     L050xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L047xts_enc_two:
+L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
        leal    32(%esi),%esi
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
-       xorps   %xmm4,%xmm4
-       call    __aesni_encrypt3
+       call    __aesni_encrypt2
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L050xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L048xts_enc_three:
+L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1229,9 +1315,9 @@ L048xts_enc_three:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     L050xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L049xts_enc_four:
+L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1253,28 +1339,28 @@ L049xts_enc_four:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L050xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L045xts_enc_done6x:
+L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      L052xts_enc_ret
+       jz      L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     L053xts_enc_steal
+       jmp     L057xts_enc_steal
 .align 4,0x90
-L050xts_enc_done:
+L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      L052xts_enc_ret
+       jz      L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-L053xts_enc_steal:
+L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1282,7 +1368,7 @@ L053xts_enc_steal:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     L053xts_enc_steal
+       jnz     L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1292,16 +1378,30 @@ L053xts_enc_steal:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L054enc1_loop_10:
+L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L054enc1_loop_10
+       jnz     L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-L052xts_enc_ret:
+L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1324,12 +1424,12 @@ L_aesni_xts_decrypt_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L055enc1_loop_11:
+L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L055enc1_loop_11
+       jnz     L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1358,12 +1458,14 @@ L055enc1_loop_11:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      L056xts_dec_short
-       shrl    $1,%ecx
-       movl    %ecx,%ebx
-       jmp     L057xts_dec_loop6
+       jc      L060xts_dec_short
+       shll    $4,%ecx
+       movl    $16,%ebx
+       subl    %ecx,%ebx
+       leal    32(%edx,%ecx,1),%edx
+       jmp     L061xts_dec_loop6
 .align 4,0x90
-L057xts_dec_loop6:
+L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1399,6 +1501,7 @@ L057xts_dec_loop6:
        pand    %xmm3,%xmm7
        movups  (%esi),%xmm2
        pxor    %xmm1,%xmm7
+       movl    %ebx,%ecx
        movdqu  16(%esi),%xmm3
        xorps   %xmm0,%xmm2
        movdqu  32(%esi),%xmm4
@@ -1414,19 +1517,17 @@ L057xts_dec_loop6:
        movdqa  %xmm7,80(%esp)
        pxor    %xmm1,%xmm7
        movups  16(%ebp),%xmm1
-       leal    32(%ebp),%edx
        pxor    16(%esp),%xmm3
-.byte  102,15,56,222,209
        pxor    32(%esp),%xmm4
-.byte  102,15,56,222,217
+.byte  102,15,56,222,209
        pxor    48(%esp),%xmm5
-       decl    %ecx
-.byte  102,15,56,222,225
        pxor    64(%esp),%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm7
+       movups  32(%ebp),%xmm0
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
 .byte  102,15,56,222,241
-       movups  (%edx),%xmm0
 .byte  102,15,56,222,249
        call    L_aesni_decrypt6_enter
        movdqa  80(%esp),%xmm1
@@ -1451,26 +1552,25 @@ L057xts_dec_loop6:
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
-       movl    %ebx,%ecx
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     L057xts_dec_loop6
-       leal    1(,%ecx,2),%ecx
+       jnc     L061xts_dec_loop6
+       movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-L056xts_dec_short:
+L060xts_dec_short:
        addl    $96,%eax
-       jz      L058xts_dec_done6x
+       jz      L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      L059xts_dec_one
+       jb      L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      L060xts_dec_two
+       je      L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1479,7 +1579,7 @@ L056xts_dec_short:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      L061xts_dec_three
+       jb      L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1489,7 +1589,7 @@ L056xts_dec_short:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      L062xts_dec_four
+       je      L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1521,9 +1621,9 @@ L056xts_dec_short:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     L063xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L059xts_dec_one:
+L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1531,36 +1631,36 @@ L059xts_dec_one:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L064dec1_loop_12:
+L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L064dec1_loop_12
+       jnz     L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     L063xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L060xts_dec_two:
+L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
        leal    32(%esi),%esi
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
-       call    __aesni_decrypt3
+       call    __aesni_decrypt2
        xorps   %xmm5,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L063xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L061xts_dec_three:
+L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1578,9 +1678,9 @@ L061xts_dec_three:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     L063xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L062xts_dec_four:
+L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1602,20 +1702,20 @@ L062xts_dec_four:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L063xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L058xts_dec_done6x:
+L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      L065xts_dec_ret
+       jz      L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     L066xts_dec_only_one_more
+       jmp     L070xts_dec_only_one_more
 .align 4,0x90
-L063xts_dec_done:
+L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      L065xts_dec_ret
+       jz      L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1625,7 +1725,7 @@ L063xts_dec_done:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-L066xts_dec_only_one_more:
+L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1639,16 +1739,16 @@ L066xts_dec_only_one_more:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L067dec1_loop_13:
+L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L067dec1_loop_13
+       jnz     L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-L068xts_dec_steal:
+L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1656,7 +1756,7 @@ L068xts_dec_steal:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     L068xts_dec_steal
+       jnz     L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1666,105 +1766,906 @@ L068xts_dec_steal:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L069dec1_loop_14:
+L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L069dec1_loop_14
+       jnz     L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-L065xts_dec_ret:
+L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
        popl    %ebx
        popl    %ebp
        ret
-.globl _aesni_cbc_encrypt
+.globl _aesni_ocb_encrypt
 .align 4
-_aesni_cbc_encrypt:
-L_aesni_cbc_encrypt_begin:
+_aesni_ocb_encrypt:
+L_aesni_ocb_encrypt_begin:
        pushl   %ebp
        pushl   %ebx
        pushl   %esi
        pushl   %edi
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
        movl    20(%esp),%esi
-       movl    %esp,%ebx
        movl    24(%esp),%edi
-       subl    $24,%ebx
        movl    28(%esp),%eax
-       andl    $-16,%ebx
        movl    32(%esp),%edx
+       movdqu  (%ecx),%xmm0
        movl    36(%esp),%ebp
-       testl   %eax,%eax
-       jz      L070cbc_abort
-       cmpl    $0,40(%esp)
-       xchgl   %esp,%ebx
-       movups  (%ebp),%xmm7
+       movdqu  (%ebx),%xmm1
+       movl    44(%esp),%ebx
+       movl    %esp,%ecx
+       subl    $132,%esp
+       andl    $-16,%esp
+       subl    %esi,%edi
+       shll    $4,%eax
+       leal    -96(%esi,%eax,1),%eax
+       movl    %edi,120(%esp)
+       movl    %eax,124(%esp)
+       movl    %ecx,128(%esp)
        movl    240(%edx),%ecx
-       movl    %edx,%ebp
-       movl    %ebx,16(%esp)
-       movl    %ecx,%ebx
-       je      L071cbc_decrypt
-       movaps  %xmm7,%xmm2
-       cmpl    $16,%eax
-       jb      L072cbc_enc_tail
-       subl    $16,%eax
-       jmp     L073cbc_enc_loop
-.align 4,0x90
-L073cbc_enc_loop:
-       movups  (%esi),%xmm7
+       testl   $1,%ebp
+       jnz     L074odd
+       bsfl    %ebp,%eax
+       addl    $1,%ebp
+       shll    $4,%eax
+       movdqu  (%ebx,%eax,1),%xmm7
+       movl    %edx,%eax
+       movdqu  (%esi),%xmm2
        leal    16(%esi),%esi
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
-       xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
-       xorps   %xmm7,%xmm2
-L074enc1_loop_15:
+       xorps   %xmm0,%xmm2
+L075enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L074enc1_loop_15
+       jnz     L075enc1_loop_15
 .byte  102,15,56,221,209
-       movl    %ebx,%ecx
-       movl    %ebp,%edx
-       movups  %xmm2,(%edi)
-       leal    16(%edi),%edi
-       subl    $16,%eax
-       jnc     L073cbc_enc_loop
-       addl    $16,%eax
-       jnz     L072cbc_enc_tail
-       movaps  %xmm2,%xmm7
-       jmp     L075cbc_ret
-L072cbc_enc_tail:
-       movl    %eax,%ecx
-.long  2767451785
-       movl    $16,%ecx
-       subl    %eax,%ecx
-       xorl    %eax,%eax
-.long  2868115081
-       leal    -16(%edi),%edi
-       movl    %ebx,%ecx
-       movl    %edi,%esi
-       movl    %ebp,%edx
-       jmp     L073cbc_enc_loop
-.align 4,0x90
-L071cbc_decrypt:
-       cmpl    $80,%eax
-       jbe     L076cbc_dec_tail
-       movaps  %xmm7,(%esp)
-       subl    $80,%eax
-       jmp     L077cbc_dec_loop6_enter
-.align 4,0x90
-L078cbc_dec_loop6:
-       movaps  %xmm0,(%esp)
-       movups  %xmm7,(%edi)
-       leal    16(%edi),%edi
-L077cbc_dec_loop6_enter:
-       movdqu  (%esi),%xmm2
+       xorps   %xmm7,%xmm2
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm6,%xmm1
+       movups  %xmm2,-16(%edi,%esi,1)
+       movl    240(%eax),%ecx
+       movl    %eax,%edx
+       movl    124(%esp),%eax
+L074odd:
+       shll    $4,%ecx
+       movl    $16,%edi
+       subl    %ecx,%edi
+       movl    %edx,112(%esp)
+       leal    32(%edx,%ecx,1),%edx
+       movl    %edi,116(%esp)
+       cmpl    %eax,%esi
+       ja      L076short
+       jmp     L077grandloop
+.align 5,0x90
+L077grandloop:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       leal    5(%ebp),%edi
+       addl    $6,%ebp
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       bsfl    %edi,%edi
+       shll    $4,%ecx
+       shll    $4,%eax
+       shll    $4,%edi
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       movdqu  (%ebx,%edi,1),%xmm7
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movdqa  %xmm7,80(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       movdqu  80(%esi),%xmm7
+       leal    96(%esi),%esi
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm6,%xmm1
+       pxor    %xmm0,%xmm6
+       pxor    %xmm7,%xmm1
+       pxor    %xmm0,%xmm7
+       movdqa  %xmm1,96(%esp)
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    80(%esp),%xmm7
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movl    120(%esp),%edi
+       movl    124(%esp),%eax
+       call    L_aesni_encrypt6_enter
+       movdqa  80(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    %xmm0,%xmm7
+       movdqa  96(%esp),%xmm1
+       movdqu  %xmm2,-96(%edi,%esi,1)
+       movdqu  %xmm3,-80(%edi,%esi,1)
+       movdqu  %xmm4,-64(%edi,%esi,1)
+       movdqu  %xmm5,-48(%edi,%esi,1)
+       movdqu  %xmm6,-32(%edi,%esi,1)
+       movdqu  %xmm7,-16(%edi,%esi,1)
+       cmpl    %eax,%esi
+       jb      L077grandloop
+L076short:
+       addl    $96,%eax
+       subl    %esi,%eax
+       jz      L078done
+       cmpl    $32,%eax
+       jb      L079one
+       je      L080two
+       cmpl    $64,%eax
+       jb      L081three
+       je      L082four
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm6,%xmm1
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm1,96(%esp)
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movl    120(%esp),%edi
+       call    L_aesni_encrypt6_enter
+       movdqa  64(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    %xmm0,%xmm6
+       movdqa  96(%esp),%xmm1
+       movdqu  %xmm2,(%edi,%esi,1)
+       movdqu  %xmm3,16(%edi,%esi,1)
+       movdqu  %xmm4,32(%edi,%esi,1)
+       movdqu  %xmm5,48(%edi,%esi,1)
+       movdqu  %xmm6,64(%edi,%esi,1)
+       jmp     L078done
+.align 4,0x90
+L079one:
+       movdqu  (%ebx),%xmm7
+       movl    112(%esp),%edx
+       movdqu  (%esi),%xmm2
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movl    120(%esp),%edi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+L083enc1_loop_16:
+.byte  102,15,56,220,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     L083enc1_loop_16
+.byte  102,15,56,221,209
+       xorps   %xmm7,%xmm2
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm6,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       jmp     L078done
+.align 4,0x90
+L080two:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm6
+       movdqu  (%ebx,%ecx,1),%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm7,%xmm3
+       movdqa  %xmm1,%xmm5
+       movl    120(%esp),%edi
+       call    __aesni_encrypt2
+       xorps   %xmm6,%xmm2
+       xorps   %xmm7,%xmm3
+       movdqa  %xmm7,%xmm0
+       movdqa  %xmm5,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       movups  %xmm3,16(%edi,%esi,1)
+       jmp     L078done
+.align 4,0x90
+L081three:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm5
+       movdqu  (%ebx,%ecx,1),%xmm6
+       movdqa  %xmm5,%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm2,%xmm1
+       pxor    %xmm5,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm6,%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm7,%xmm4
+       movdqa  %xmm1,96(%esp)
+       movl    120(%esp),%edi
+       call    __aesni_encrypt3
+       xorps   %xmm5,%xmm2
+       xorps   %xmm6,%xmm3
+       xorps   %xmm7,%xmm4
+       movdqa  %xmm7,%xmm0
+       movdqa  96(%esp),%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       movups  %xmm3,16(%edi,%esi,1)
+       movups  %xmm4,32(%edi,%esi,1)
+       jmp     L078done
+.align 4,0x90
+L082four:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       movl    112(%esp),%edx
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm4
+       movdqu  (%ebx,%ecx,1),%xmm5
+       movdqa  %xmm4,%xmm6
+       movdqu  (%ebx,%eax,1),%xmm7
+       pxor    %xmm0,%xmm4
+       movdqu  (%esi),%xmm2
+       pxor    %xmm4,%xmm5
+       movdqu  16(%esi),%xmm3
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm4,(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm5,16(%esp)
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movl    240(%edx),%ecx
+       pxor    %xmm2,%xmm1
+       pxor    (%esp),%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    %xmm4,%xmm1
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm1
+       pxor    %xmm7,%xmm5
+       movdqa  %xmm1,96(%esp)
+       movl    120(%esp),%edi
+       call    __aesni_encrypt4
+       xorps   (%esp),%xmm2
+       xorps   16(%esp),%xmm3
+       xorps   %xmm6,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       xorps   %xmm7,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       movdqa  %xmm7,%xmm0
+       movups  %xmm4,32(%edi,%esi,1)
+       movdqa  96(%esp),%xmm1
+       movups  %xmm5,48(%edi,%esi,1)
+L078done:
+       movl    128(%esp),%edx
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm2,16(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm2,32(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm2,48(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm2,64(%esp)
+       movdqa  %xmm2,80(%esp)
+       movdqa  %xmm2,96(%esp)
+       leal    (%edx),%esp
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movdqu  %xmm0,(%ecx)
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm1,(%ebx)
+       pxor    %xmm1,%xmm1
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _aesni_ocb_decrypt
+.align 4
+_aesni_ocb_decrypt:
+L_aesni_ocb_decrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movl    20(%esp),%esi
+       movl    24(%esp),%edi
+       movl    28(%esp),%eax
+       movl    32(%esp),%edx
+       movdqu  (%ecx),%xmm0
+       movl    36(%esp),%ebp
+       movdqu  (%ebx),%xmm1
+       movl    44(%esp),%ebx
+       movl    %esp,%ecx
+       subl    $132,%esp
+       andl    $-16,%esp
+       subl    %esi,%edi
+       shll    $4,%eax
+       leal    -96(%esi,%eax,1),%eax
+       movl    %edi,120(%esp)
+       movl    %eax,124(%esp)
+       movl    %ecx,128(%esp)
+       movl    240(%edx),%ecx
+       testl   $1,%ebp
+       jnz     L084odd
+       bsfl    %ebp,%eax
+       addl    $1,%ebp
+       shll    $4,%eax
+       movdqu  (%ebx,%eax,1),%xmm7
+       movl    %edx,%eax
+       movdqu  (%esi),%xmm2
+       leal    16(%esi),%esi
+       pxor    %xmm0,%xmm7
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+L085dec1_loop_17:
+.byte  102,15,56,222,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     L085dec1_loop_17
+.byte  102,15,56,223,209
+       xorps   %xmm7,%xmm2
+       movaps  %xmm6,%xmm1
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm1
+       movups  %xmm2,-16(%edi,%esi,1)
+       movl    240(%eax),%ecx
+       movl    %eax,%edx
+       movl    124(%esp),%eax
+L084odd:
+       shll    $4,%ecx
+       movl    $16,%edi
+       subl    %ecx,%edi
+       movl    %edx,112(%esp)
+       leal    32(%edx,%ecx,1),%edx
+       movl    %edi,116(%esp)
+       cmpl    %eax,%esi
+       ja      L086short
+       jmp     L087grandloop
+.align 5,0x90
+L087grandloop:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       leal    5(%ebp),%edi
+       addl    $6,%ebp
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       bsfl    %edi,%edi
+       shll    $4,%ecx
+       shll    $4,%eax
+       shll    $4,%edi
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       movdqu  (%ebx,%edi,1),%xmm7
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movdqa  %xmm7,80(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       movdqu  80(%esi),%xmm7
+       leal    96(%esi),%esi
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       pxor    %xmm0,%xmm4
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    80(%esp),%xmm7
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movl    120(%esp),%edi
+       movl    124(%esp),%eax
+       call    L_aesni_decrypt6_enter
+       movdqa  80(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       movdqa  96(%esp),%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       pxor    %xmm0,%xmm7
+       pxor    %xmm2,%xmm1
+       movdqu  %xmm2,-96(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqu  %xmm3,-80(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movdqu  %xmm4,-64(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+       movdqu  %xmm5,-48(%edi,%esi,1)
+       pxor    %xmm6,%xmm1
+       movdqu  %xmm6,-32(%edi,%esi,1)
+       pxor    %xmm7,%xmm1
+       movdqu  %xmm7,-16(%edi,%esi,1)
+       cmpl    %eax,%esi
+       jb      L087grandloop
+L086short:
+       addl    $96,%eax
+       subl    %esi,%eax
+       jz      L088done
+       cmpl    $32,%eax
+       jb      L089one
+       je      L090two
+       cmpl    $64,%eax
+       jb      L091three
+       je      L092four
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm2
+       movdqu  (%ebx,%ecx,1),%xmm3
+       movl    116(%esp),%ecx
+       movdqa  %xmm2,%xmm4
+       movdqu  (%ebx,%eax,1),%xmm5
+       movdqa  %xmm2,%xmm6
+       pxor    %xmm0,%xmm2
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm3,%xmm4
+       movdqa  %xmm3,16(%esp)
+       pxor    %xmm4,%xmm5
+       movdqa  %xmm4,32(%esp)
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm5,48(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm6,64(%esp)
+       movups  -48(%edx,%ecx,1),%xmm0
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movdqu  64(%esi),%xmm6
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm2
+       pxor    %xmm0,%xmm3
+       pxor    %xmm0,%xmm4
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       movups  -32(%edx,%ecx,1),%xmm1
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    64(%esp),%xmm6
+       movups  -16(%edx,%ecx,1),%xmm0
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movl    120(%esp),%edi
+       call    L_aesni_decrypt6_enter
+       movdqa  64(%esp),%xmm0
+       pxor    (%esp),%xmm2
+       movdqa  96(%esp),%xmm1
+       pxor    16(%esp),%xmm3
+       pxor    32(%esp),%xmm4
+       pxor    48(%esp),%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm2,%xmm1
+       movdqu  %xmm2,(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqu  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movdqu  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+       movdqu  %xmm5,48(%edi,%esi,1)
+       pxor    %xmm6,%xmm1
+       movdqu  %xmm6,64(%edi,%esi,1)
+       jmp     L088done
+.align 4,0x90
+L089one:
+       movdqu  (%ebx),%xmm7
+       movl    112(%esp),%edx
+       movdqu  (%esi),%xmm2
+       movl    240(%edx),%ecx
+       pxor    %xmm0,%xmm7
+       pxor    %xmm7,%xmm2
+       movdqa  %xmm1,%xmm6
+       movl    120(%esp),%edi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       leal    32(%edx),%edx
+       xorps   %xmm0,%xmm2
+L093dec1_loop_18:
+.byte  102,15,56,222,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     L093dec1_loop_18
+.byte  102,15,56,223,209
+       xorps   %xmm7,%xmm2
+       movaps  %xmm6,%xmm1
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm1
+       movups  %xmm2,(%edi,%esi,1)
+       jmp     L088done
+.align 4,0x90
+L090two:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm6
+       movdqu  (%ebx,%ecx,1),%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm6,%xmm2
+       pxor    %xmm7,%xmm3
+       movl    120(%esp),%edi
+       call    __aesni_decrypt2
+       xorps   %xmm6,%xmm2
+       xorps   %xmm7,%xmm3
+       movdqa  %xmm7,%xmm0
+       xorps   %xmm2,%xmm5
+       movups  %xmm2,(%edi,%esi,1)
+       xorps   %xmm3,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       movaps  %xmm5,%xmm1
+       jmp     L088done
+.align 4,0x90
+L091three:
+       leal    1(%ebp),%ecx
+       movl    112(%esp),%edx
+       bsfl    %ecx,%ecx
+       shll    $4,%ecx
+       movdqu  (%ebx),%xmm5
+       movdqu  (%ebx,%ecx,1),%xmm6
+       movdqa  %xmm5,%xmm7
+       movdqu  (%esi),%xmm2
+       movdqu  16(%esi),%xmm3
+       movdqu  32(%esi),%xmm4
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,96(%esp)
+       pxor    %xmm0,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm7
+       pxor    %xmm5,%xmm2
+       pxor    %xmm6,%xmm3
+       pxor    %xmm7,%xmm4
+       movl    120(%esp),%edi
+       call    __aesni_decrypt3
+       movdqa  96(%esp),%xmm1
+       xorps   %xmm5,%xmm2
+       xorps   %xmm6,%xmm3
+       xorps   %xmm7,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm7,%xmm0
+       movups  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movups  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       jmp     L088done
+.align 4,0x90
+L092four:
+       leal    1(%ebp),%ecx
+       leal    3(%ebp),%eax
+       bsfl    %ecx,%ecx
+       bsfl    %eax,%eax
+       movl    112(%esp),%edx
+       shll    $4,%ecx
+       shll    $4,%eax
+       movdqu  (%ebx),%xmm4
+       movdqu  (%ebx,%ecx,1),%xmm5
+       movdqa  %xmm4,%xmm6
+       movdqu  (%ebx,%eax,1),%xmm7
+       pxor    %xmm0,%xmm4
+       movdqu  (%esi),%xmm2
+       pxor    %xmm4,%xmm5
+       movdqu  16(%esi),%xmm3
+       pxor    %xmm5,%xmm6
+       movdqa  %xmm4,(%esp)
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm5,16(%esp)
+       movdqu  32(%esi),%xmm4
+       movdqu  48(%esi),%xmm5
+       movl    240(%edx),%ecx
+       movdqa  %xmm1,96(%esp)
+       pxor    (%esp),%xmm2
+       pxor    16(%esp),%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm7,%xmm5
+       movl    120(%esp),%edi
+       call    __aesni_decrypt4
+       movdqa  96(%esp),%xmm1
+       xorps   (%esp),%xmm2
+       xorps   16(%esp),%xmm3
+       xorps   %xmm6,%xmm4
+       movups  %xmm2,(%edi,%esi,1)
+       pxor    %xmm2,%xmm1
+       xorps   %xmm7,%xmm5
+       movups  %xmm3,16(%edi,%esi,1)
+       pxor    %xmm3,%xmm1
+       movdqa  %xmm7,%xmm0
+       movups  %xmm4,32(%edi,%esi,1)
+       pxor    %xmm4,%xmm1
+       movups  %xmm5,48(%edi,%esi,1)
+       pxor    %xmm5,%xmm1
+L088done:
+       movl    128(%esp),%edx
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm2,(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm2,16(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm2,32(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm2,48(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm2,64(%esp)
+       movdqa  %xmm2,80(%esp)
+       movdqa  %xmm2,96(%esp)
+       leal    (%edx),%esp
+       movl    40(%esp),%ecx
+       movl    48(%esp),%ebx
+       movdqu  %xmm0,(%ecx)
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm1,(%ebx)
+       pxor    %xmm1,%xmm1
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _aesni_cbc_encrypt
+.align 4
+_aesni_cbc_encrypt:
+L_aesni_cbc_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       movl    %esp,%ebx
+       movl    24(%esp),%edi
+       subl    $24,%ebx
+       movl    28(%esp),%eax
+       andl    $-16,%ebx
+       movl    32(%esp),%edx
+       movl    36(%esp),%ebp
+       testl   %eax,%eax
+       jz      L094cbc_abort
+       cmpl    $0,40(%esp)
+       xchgl   %esp,%ebx
+       movups  (%ebp),%xmm7
+       movl    240(%edx),%ecx
+       movl    %edx,%ebp
+       movl    %ebx,16(%esp)
+       movl    %ecx,%ebx
+       je      L095cbc_decrypt
+       movaps  %xmm7,%xmm2
+       cmpl    $16,%eax
+       jb      L096cbc_enc_tail
+       subl    $16,%eax
+       jmp     L097cbc_enc_loop
+.align 4,0x90
+L097cbc_enc_loop:
+       movups  (%esi),%xmm7
+       leal    16(%esi),%esi
+       movups  (%edx),%xmm0
+       movups  16(%edx),%xmm1
+       xorps   %xmm0,%xmm7
+       leal    32(%edx),%edx
+       xorps   %xmm7,%xmm2
+L098enc1_loop_19:
+.byte  102,15,56,220,209
+       decl    %ecx
+       movups  (%edx),%xmm1
+       leal    16(%edx),%edx
+       jnz     L098enc1_loop_19
+.byte  102,15,56,221,209
+       movl    %ebx,%ecx
+       movl    %ebp,%edx
+       movups  %xmm2,(%edi)
+       leal    16(%edi),%edi
+       subl    $16,%eax
+       jnc     L097cbc_enc_loop
+       addl    $16,%eax
+       jnz     L096cbc_enc_tail
+       movaps  %xmm2,%xmm7
+       pxor    %xmm2,%xmm2
+       jmp     L099cbc_ret
+L096cbc_enc_tail:
+       movl    %eax,%ecx
+.long  2767451785
+       movl    $16,%ecx
+       subl    %eax,%ecx
+       xorl    %eax,%eax
+.long  2868115081
+       leal    -16(%edi),%edi
+       movl    %ebx,%ecx
+       movl    %edi,%esi
+       movl    %ebp,%edx
+       jmp     L097cbc_enc_loop
+.align 4,0x90
+L095cbc_decrypt:
+       cmpl    $80,%eax
+       jbe     L100cbc_dec_tail
+       movaps  %xmm7,(%esp)
+       subl    $80,%eax
+       jmp     L101cbc_dec_loop6_enter
+.align 4,0x90
+L102cbc_dec_loop6:
+       movaps  %xmm0,(%esp)
+       movups  %xmm7,(%edi)
+       leal    16(%edi),%edi
+L101cbc_dec_loop6_enter:
+       movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
        movdqu  48(%esi),%xmm5
@@ -1793,28 +2694,28 @@ L077cbc_dec_loop6_enter:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      L078cbc_dec_loop6
+       ja      L102cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     L079cbc_dec_tail_collected
+       jle     L103cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-L076cbc_dec_tail:
+L100cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     L080cbc_dec_one
+       jbe     L104cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     L081cbc_dec_two
+       jbe     L105cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     L082cbc_dec_three
+       jbe     L106cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     L083cbc_dec_four
+       jbe     L107cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1832,56 +2733,62 @@ L076cbc_dec_tail:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     L079cbc_dec_tail_collected
+       jmp     L108cbc_dec_tail_collected
 .align 4,0x90
-L080cbc_dec_one:
+L104cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L084dec1_loop_16:
+L109dec1_loop_20:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L084dec1_loop_16
+       jnz     L109dec1_loop_20
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     L079cbc_dec_tail_collected
+       jmp     L108cbc_dec_tail_collected
 .align 4,0x90
-L081cbc_dec_two:
-       xorps   %xmm4,%xmm4
-       call    __aesni_decrypt3
+L105cbc_dec_two:
+       call    __aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     L079cbc_dec_tail_collected
+       jmp     L108cbc_dec_tail_collected
 .align 4,0x90
-L082cbc_dec_three:
+L106cbc_dec_three:
        call    __aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     L079cbc_dec_tail_collected
+       jmp     L108cbc_dec_tail_collected
 .align 4,0x90
-L083cbc_dec_four:
+L107cbc_dec_four:
        call    __aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1891,28 +2798,44 @@ L083cbc_dec_four:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-L079cbc_dec_tail_collected:
+       jmp     L108cbc_dec_tail_collected
+.align 4,0x90
+L103cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+L108cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     L085cbc_dec_tail_partial
+       jnz     L110cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     L075cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     L099cbc_ret
 .align 4,0x90
-L085cbc_dec_tail_partial:
+L110cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-L075cbc_ret:
+       movdqa  %xmm2,(%esp)
+L099cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-L070cbc_abort:
+       pxor    %xmm7,%xmm7
+L094cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1920,52 +2843,62 @@ L070cbc_abort:
        ret
 .align 4
 __aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      L086bad_pointer
+       jz      L111bad_pointer
        testl   %edx,%edx
-       jz      L086bad_pointer
+       jz      L111bad_pointer
+       call    L112pic
+L112pic:
+       popl    %ebx
+       leal    Lkey_const-L112pic(%ebx),%ebx
+       movl    L__gnutls_x86_cpuid_s$non_lazy_ptr-Lkey_const(%ebx),%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      L08714rounds
+       je      L11314rounds
        cmpl    $192,%ecx
-       je      L08812rounds
+       je      L11412rounds
        cmpl    $128,%ecx
-       jne     L089bad_keybits
+       jne     L115bad_keybits
 .align 4,0x90
-L09010rounds:
+L11610rounds:
+       cmpl    $268435456,%ebp
+       je      L11710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    L091key_128_cold
+       call    L118key_128_cold
 .byte  102,15,58,223,200,2
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,4
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,8
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,16
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,32
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,64
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,128
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,27
-       call    L092key_128
+       call    L119key_128
 .byte  102,15,58,223,200,54
-       call    L092key_128
+       call    L119key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     L120good_key
 .align 4,0x90
-L092key_128:
+L119key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-L091key_128_cold:
+L118key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -1974,38 +2907,91 @@ L091key_128_cold:
        xorps   %xmm1,%xmm0
        ret
 .align 4,0x90
-L08812rounds:
+L11710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+L121loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     L121loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     L120good_key
+.align 4,0x90
+L11412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      L12212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    L093key_192a_cold
+       call    L123key_192a_cold
 .byte  102,15,58,223,202,2
-       call    L094key_192b
+       call    L124key_192b
 .byte  102,15,58,223,202,4
-       call    L095key_192a
+       call    L125key_192a
 .byte  102,15,58,223,202,8
-       call    L094key_192b
+       call    L124key_192b
 .byte  102,15,58,223,202,16
-       call    L095key_192a
+       call    L125key_192a
 .byte  102,15,58,223,202,32
-       call    L094key_192b
+       call    L124key_192b
 .byte  102,15,58,223,202,64
-       call    L095key_192a
+       call    L125key_192a
 .byte  102,15,58,223,202,128
-       call    L094key_192b
+       call    L124key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     L120good_key
 .align 4,0x90
-L095key_192a:
+L125key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 4,0x90
-L093key_192a_cold:
+L123key_192a_cold:
        movaps  %xmm2,%xmm5
-L096key_192b_warm:
+L126key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2019,56 +3005,90 @@ L096key_192b_warm:
        pxor    %xmm3,%xmm2
        ret
 .align 4,0x90
-L094key_192b:
+L124key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     L096key_192b_warm
+       jmp     L126key_192b_warm
 .align 4,0x90
-L08714rounds:
+L12212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+L127loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     L127loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     L120good_key
+.align 4,0x90
+L11314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      L12814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    L097key_256a_cold
+       call    L129key_256a_cold
 .byte  102,15,58,223,200,1
-       call    L098key_256b
+       call    L130key_256b
 .byte  102,15,58,223,202,2
-       call    L099key_256a
+       call    L131key_256a
 .byte  102,15,58,223,200,2
-       call    L098key_256b
+       call    L130key_256b
 .byte  102,15,58,223,202,4
-       call    L099key_256a
+       call    L131key_256a
 .byte  102,15,58,223,200,4
-       call    L098key_256b
+       call    L130key_256b
 .byte  102,15,58,223,202,8
-       call    L099key_256a
+       call    L131key_256a
 .byte  102,15,58,223,200,8
-       call    L098key_256b
+       call    L130key_256b
 .byte  102,15,58,223,202,16
-       call    L099key_256a
+       call    L131key_256a
 .byte  102,15,58,223,200,16
-       call    L098key_256b
+       call    L130key_256b
 .byte  102,15,58,223,202,32
-       call    L099key_256a
+       call    L131key_256a
 .byte  102,15,58,223,200,32
-       call    L098key_256b
+       call    L130key_256b
 .byte  102,15,58,223,202,64
-       call    L099key_256a
+       call    L131key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     L120good_key
 .align 4,0x90
-L099key_256a:
+L131key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-L097key_256a_cold:
+L129key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2077,7 +3097,7 @@ L097key_256a_cold:
        xorps   %xmm1,%xmm0
        ret
 .align 4,0x90
-L098key_256b:
+L130key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2087,13 +3107,70 @@ L098key_256b:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 4,0x90
+L12814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+L132loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      L133done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     L132loop_key256
+L133done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+L120good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 2,0x90
-L086bad_pointer:
+L111bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 2,0x90
-L089bad_keybits:
+L115bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .globl _aesni_set_encrypt_key
 .align 4
@@ -2115,7 +3192,7 @@ L_aesni_set_decrypt_key_begin:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     L100dec_key_ret
+       jnz     L134dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2123,7 +3200,7 @@ L_aesni_set_decrypt_key_begin:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-L101dec_key_inverse:
+L135dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2133,15 +3210,28 @@ L101dec_key_inverse:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      L101dec_key_inverse
+       ja      L135dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-L100dec_key_ret:
+L134dec_key_ret:
        ret
+.align 6,0x90
+Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L__gnutls_x86_cpuid_s$non_lazy_ptr:
+.indirect_symbol       __gnutls_x86_cpuid_s
+.long  0
+.comm  __gnutls_x86_cpuid_s,16,2
 
index f0a5606348eca40a8e41a38a30c6adcf33b24459..f6145f166b5c928b3f9233fca2c2962421fe6ea0 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,7 @@
 
 .p2align       4
 _aesni_encrypt:
+
        movups  (%rdi),%xmm2
        movl    240(%rdx),%eax
        movups  (%rdx),%xmm0
@@ -63,10 +64,12 @@ L$oop_enc1_1:
        .byte   0xf3,0xc3
 
 
+
 .globl _aesni_decrypt
 
 .p2align       4
 _aesni_decrypt:
+
        movups  (%rdi),%xmm2
        movl    240(%rdx),%eax
        movups  (%rdx),%xmm0
@@ -87,8 +90,10 @@ L$oop_dec1_2:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_encrypt2:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -116,8 +121,10 @@ L$enc_loop2:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_decrypt2:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -145,8 +152,10 @@ L$dec_loop2:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_encrypt3:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -179,8 +188,10 @@ L$enc_loop3:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_decrypt3:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -213,8 +224,10 @@ L$dec_loop3:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_encrypt4:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -253,8 +266,10 @@ L$enc_loop4:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_decrypt4:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -293,8 +308,10 @@ L$dec_loop4:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_encrypt6:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -347,8 +364,10 @@ L$enc_loop6_enter:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_decrypt6:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -401,8 +420,10 @@ L$dec_loop6_enter:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_encrypt8:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -465,8 +486,10 @@ L$enc_loop8_enter:
        .byte   0xf3,0xc3
 
 
+
 .p2align       4
 _aesni_decrypt8:
+
        movups  (%rcx),%xmm0
        shll    $4,%eax
        movups  16(%rcx),%xmm1
@@ -528,10 +551,12 @@ L$dec_loop8_enter:
 .byte  102,68,15,56,223,200
        .byte   0xf3,0xc3
 
+
 .globl _aesni_ecb_encrypt
 
 .p2align       4
 _aesni_ecb_encrypt:
+
        andq    $-16,%rdx
        jz      L$ecb_ret
 
@@ -870,6 +895,7 @@ L$ecb_ret:
        pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
 
+
 .globl _aesni_ccm64_encrypt_blocks
 
 .p2align       4
@@ -1034,6 +1060,7 @@ L$oop_enc1_6:
 
 .p2align       4
 _aesni_ctr32_encrypt_blocks:
+
        cmpq    $1,%rdx
        jne     L$ctr32_bulk
 
@@ -1063,11 +1090,12 @@ L$oop_enc1_7:
 
 .p2align       4
 L$ctr32_bulk:
-       leaq    (%rsp),%rax
+       leaq    (%rsp),%r11
+
        pushq   %rbp
+
        subq    $128,%rsp
        andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
 
 
 
@@ -1076,7 +1104,7 @@ L$ctr32_bulk:
        movdqu  (%rcx),%xmm0
        movl    12(%r8),%r8d
        pxor    %xmm0,%xmm2
-       movl    12(%rcx),%r11d
+       movl    12(%rcx),%ebp
        movdqa  %xmm2,0(%rsp)
        bswapl  %r8d
        movdqa  %xmm2,%xmm3
@@ -1092,8 +1120,8 @@ L$ctr32_bulk:
        leaq    2(%r8),%rdx
        bswapl  %eax
        bswapl  %edx
-       xorl    %r11d,%eax
-       xorl    %r11d,%edx
+       xorl    %ebp,%eax
+       xorl    %ebp,%edx
 .byte  102,15,58,34,216,3
        leaq    3(%r8),%rax
        movdqa  %xmm3,16(%rsp)
@@ -1102,25 +1130,25 @@ L$ctr32_bulk:
        movq    %r10,%rdx
        leaq    4(%r8),%r10
        movdqa  %xmm4,32(%rsp)
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
        bswapl  %r10d
 .byte  102,15,58,34,232,3
-       xorl    %r11d,%r10d
+       xorl    %ebp,%r10d
        movdqa  %xmm5,48(%rsp)
        leaq    5(%r8),%r9
        movl    %r10d,64+12(%rsp)
        bswapl  %r9d
        leaq    6(%r8),%r10
        movl    240(%rcx),%eax
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        bswapl  %r10d
        movl    %r9d,80+12(%rsp)
-       xorl    %r11d,%r10d
+       xorl    %ebp,%r10d
        leaq    7(%r8),%r9
        movl    %r10d,96+12(%rsp)
        bswapl  %r9d
        movl    __gnutls_x86_cpuid_s+4(%rip),%r10d
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        andl    $71303168,%r10d
        movl    %r9d,112+12(%rsp)
 
@@ -1144,7 +1172,7 @@ L$ctr32_bulk:
 L$ctr32_6x:
        shll    $4,%eax
        movl    $48,%r10d
-       bswapl  %r11d
+       bswapl  %ebp
        leaq    32(%rcx,%rax,1),%rcx
        subq    %rax,%r10
        jmp     L$ctr32_loop6
@@ -1155,32 +1183,32 @@ L$ctr32_loop6:
        movups  -48(%rcx,%r10,1),%xmm0
 .byte  102,15,56,220,209
        movl    %r8d,%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,217
 .byte  0x0f,0x38,0xf1,0x44,0x24,12
        leal    1(%r8),%eax
 .byte  102,15,56,220,225
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  0x0f,0x38,0xf1,0x44,0x24,28
 .byte  102,15,56,220,233
        leal    2(%r8),%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,241
 .byte  0x0f,0x38,0xf1,0x44,0x24,44
        leal    3(%r8),%eax
 .byte  102,15,56,220,249
        movups  -32(%rcx,%r10,1),%xmm1
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 
 .byte  102,15,56,220,208
 .byte  0x0f,0x38,0xf1,0x44,0x24,60
        leal    4(%r8),%eax
 .byte  102,15,56,220,216
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  0x0f,0x38,0xf1,0x44,0x24,76
 .byte  102,15,56,220,224
        leal    5(%r8),%eax
-       xorl    %r11d,%eax
+       xorl    %ebp,%eax
 .byte  102,15,56,220,232
 .byte  0x0f,0x38,0xf1,0x44,0x24,92
        movq    %r10,%rax
@@ -1241,7 +1269,7 @@ L$ctr32_loop8:
        bswapl  %r9d
        movups  32-128(%rcx),%xmm0
 .byte  102,15,56,220,225
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        nop
 .byte  102,15,56,220,233
        movl    %r9d,0+12(%rsp)
@@ -1254,7 +1282,7 @@ L$ctr32_loop8:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1268,7 +1296,7 @@ L$ctr32_loop8:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1282,7 +1310,7 @@ L$ctr32_loop8:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1296,7 +1324,7 @@ L$ctr32_loop8:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1310,7 +1338,7 @@ L$ctr32_loop8:
        bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
@@ -1324,7 +1352,7 @@ L$ctr32_loop8:
        bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
 .byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
@@ -1339,7 +1367,7 @@ L$ctr32_loop8:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
 .byte  102,15,56,220,224
-       xorl    %r11d,%r9d
+       xorl    %ebp,%r9d
        movdqu  0(%rdi),%xmm10
 .byte  102,15,56,220,232
        movl    %r9d,112+12(%rsp)
@@ -1574,7 +1602,7 @@ L$ctr32_loop3:
 
 L$ctr32_done:
        xorps   %xmm0,%xmm0
-       xorl    %r11d,%r11d
+       xorl    %ebp,%ebp
        pxor    %xmm1,%xmm1
        pxor    %xmm2,%xmm2
        pxor    %xmm3,%xmm3
@@ -1598,20 +1626,25 @@ L$ctr32_done:
        pxor    %xmm14,%xmm14
        movaps  %xmm0,112(%rsp)
        pxor    %xmm15,%xmm15
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 L$ctr32_epilogue:
        .byte   0xf3,0xc3
 
+
 .globl _aesni_xts_encrypt
 
 .p2align       4
 _aesni_xts_encrypt:
-       leaq    (%rsp),%rax
+
+       leaq    (%rsp),%r11
+
        pushq   %rbp
+
        subq    $112,%rsp
        andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
        movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
@@ -1627,7 +1660,7 @@ L$oop_enc1_8:
        jnz     L$oop_enc1_8
 .byte  102,15,56,221,209
        movups  (%rcx),%xmm0
-       movq    %rcx,%r11
+       movq    %rcx,%rbp
        movl    %r10d,%eax
        shll    $4,%r10d
        movq    %rdx,%r9
@@ -1683,9 +1716,9 @@ L$oop_enc1_8:
        jc      L$xts_enc_short
 
        movl    $16+96,%eax
-       leaq    32(%r11,%r10,1),%rcx
+       leaq    32(%rbp,%r10,1),%rcx
        subq    %r10,%rax
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
        movq    %rax,%r10
        leaq    L$xts_magic(%rip),%r8
        jmp     L$xts_enc_grandloop
@@ -1710,7 +1743,7 @@ L$xts_enc_grandloop:
        movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
 .byte  102,15,56,220,233
-       movups  32(%r11),%xmm0
+       movups  32(%rbp),%xmm0
        leaq    96(%rdi),%rdi
        pxor    %xmm8,%xmm7
 
@@ -1719,7 +1752,7 @@ L$xts_enc_grandloop:
        pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,220,249
-       movups  48(%r11),%xmm1
+       movups  48(%rbp),%xmm1
        pxor    %xmm9,%xmm12
 
 .byte  102,15,56,220,208
@@ -1734,7 +1767,7 @@ L$xts_enc_grandloop:
        movdqa  %xmm14,64(%rsp)
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  64(%r11),%xmm0
+       movups  64(%rbp),%xmm0
        movdqa  %xmm8,80(%rsp)
        pshufd  $0x5f,%xmm15,%xmm9
        jmp     L$xts_enc_loop6
@@ -1766,7 +1799,7 @@ L$xts_enc_loop6:
        psrad   $31,%xmm14
 .byte  102,15,56,220,217
        pand    %xmm8,%xmm14
-       movups  (%r11),%xmm10
+       movups  (%rbp),%xmm10
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -1834,10 +1867,10 @@ L$xts_enc_loop6:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
        pxor    %xmm0,%xmm15
-       movups  (%r11),%xmm0
+       movups  (%rbp),%xmm0
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
 
        pxor    %xmm15,%xmm14
 .byte  102,15,56,221,84,36,0
@@ -1864,7 +1897,7 @@ L$xts_enc_loop6:
 
        movl    $16+96,%eax
        subl    %r10d,%eax
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        shrl    $4,%eax
 
 L$xts_enc_short:
@@ -2020,7 +2053,7 @@ L$xts_enc_steal:
        jnz     L$xts_enc_steal
 
        subq    %r9,%rsi
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  -16(%rsi),%xmm2
@@ -2063,20 +2096,25 @@ L$xts_enc_ret:
        movaps  %xmm0,96(%rsp)
        pxor    %xmm14,%xmm14
        pxor    %xmm15,%xmm15
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 L$xts_enc_epilogue:
        .byte   0xf3,0xc3
 
+
 .globl _aesni_xts_decrypt
 
 .p2align       4
 _aesni_xts_decrypt:
-       leaq    (%rsp),%rax
+
+       leaq    (%rsp),%r11
+
        pushq   %rbp
+
        subq    $112,%rsp
        andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
        movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
@@ -2098,7 +2136,7 @@ L$oop_enc1_11:
        subq    %rax,%rdx
 
        movups  (%rcx),%xmm0
-       movq    %rcx,%r11
+       movq    %rcx,%rbp
        movl    %r10d,%eax
        shll    $4,%r10d
        movq    %rdx,%r9
@@ -2154,9 +2192,9 @@ L$oop_enc1_11:
        jc      L$xts_dec_short
 
        movl    $16+96,%eax
-       leaq    32(%r11,%r10,1),%rcx
+       leaq    32(%rbp,%r10,1),%rcx
        subq    %r10,%rax
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
        movq    %rax,%r10
        leaq    L$xts_magic(%rip),%r8
        jmp     L$xts_dec_grandloop
@@ -2181,7 +2219,7 @@ L$xts_dec_grandloop:
        movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
 .byte  102,15,56,222,233
-       movups  32(%r11),%xmm0
+       movups  32(%rbp),%xmm0
        leaq    96(%rdi),%rdi
        pxor    %xmm8,%xmm7
 
@@ -2190,7 +2228,7 @@ L$xts_dec_grandloop:
        pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,222,249
-       movups  48(%r11),%xmm1
+       movups  48(%rbp),%xmm1
        pxor    %xmm9,%xmm12
 
 .byte  102,15,56,222,208
@@ -2205,7 +2243,7 @@ L$xts_dec_grandloop:
        movdqa  %xmm14,64(%rsp)
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  64(%r11),%xmm0
+       movups  64(%rbp),%xmm0
        movdqa  %xmm8,80(%rsp)
        pshufd  $0x5f,%xmm15,%xmm9
        jmp     L$xts_dec_loop6
@@ -2237,7 +2275,7 @@ L$xts_dec_loop6:
        psrad   $31,%xmm14
 .byte  102,15,56,222,217
        pand    %xmm8,%xmm14
-       movups  (%r11),%xmm10
+       movups  (%rbp),%xmm10
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -2305,10 +2343,10 @@ L$xts_dec_loop6:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
        pxor    %xmm0,%xmm15
-       movups  (%r11),%xmm0
+       movups  (%rbp),%xmm0
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-       movups  16(%r11),%xmm1
+       movups  16(%rbp),%xmm1
 
        pxor    %xmm15,%xmm14
 .byte  102,15,56,223,84,36,0
@@ -2335,7 +2373,7 @@ L$xts_dec_loop6:
 
        movl    $16+96,%eax
        subl    %r10d,%eax
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        shrl    $4,%eax
 
 L$xts_dec_short:
@@ -2492,7 +2530,7 @@ L$xts_dec_done:
        jz      L$xts_dec_ret
 L$xts_dec_done2:
        movq    %r9,%rdx
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  (%rdi),%xmm2
@@ -2522,7 +2560,7 @@ L$xts_dec_steal:
        jnz     L$xts_dec_steal
 
        subq    %r9,%rsi
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movl    %r10d,%eax
 
        movups  (%rsi),%xmm2
@@ -2565,172 +2603,1020 @@ L$xts_dec_ret:
        movaps  %xmm0,96(%rsp)
        pxor    %xmm14,%xmm14
        pxor    %xmm15,%xmm15
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 L$xts_dec_epilogue:
        .byte   0xf3,0xc3
 
-.globl _aesni_cbc_encrypt
 
-.p2align       4
-_aesni_cbc_encrypt:
-       testq   %rdx,%rdx
-       jz      L$cbc_ret
+.globl _aesni_ocb_encrypt
+
+.p2align       5
+_aesni_ocb_encrypt:
+
+       leaq    (%rsp),%rax
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       movq    8(%rax),%rbx
+       movq    8+8(%rax),%rbp
 
        movl    240(%rcx),%r10d
        movq    %rcx,%r11
-       testl   %r9d,%r9d
-       jz      L$cbc_decrypt
+       shll    $4,%r10d
+       movups  (%rcx),%xmm9
+       movups  16(%rcx,%r10,1),%xmm1
 
-       movups  (%r8),%xmm2
-       movl    %r10d,%eax
-       cmpq    $16,%rdx
-       jb      L$cbc_enc_tail
-       subq    $16,%rdx
-       jmp     L$cbc_enc_loop
-.p2align       4
-L$cbc_enc_loop:
-       movups  (%rdi),%xmm3
-       leaq    16(%rdi),%rdi
+       movdqu  (%r9),%xmm15
+       pxor    %xmm1,%xmm9
+       pxor    %xmm1,%xmm15
 
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       xorps   %xmm0,%xmm3
-       leaq    32(%rcx),%rcx
-       xorps   %xmm3,%xmm2
-L$oop_enc1_15:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     L$oop_enc1_15
-.byte  102,15,56,221,209
-       movl    %r10d,%eax
-       movq    %r11,%rcx
-       movups  %xmm2,0(%rsi)
-       leaq    16(%rsi),%rsi
-       subq    $16,%rdx
-       jnc     L$cbc_enc_loop
-       addq    $16,%rdx
-       jnz     L$cbc_enc_tail
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       movups  %xmm2,(%r8)
-       pxor    %xmm2,%xmm2
-       pxor    %xmm3,%xmm3
-       jmp     L$cbc_ret
+       movl    $16+32,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       movups  16(%r11),%xmm1
+       subq    %r10,%rax
+       movq    %rax,%r10
 
-L$cbc_enc_tail:
-       movq    %rdx,%rcx
-       xchgq   %rdi,%rsi
-.long  0x9066A4F3
-       movl    $16,%ecx
-       subq    %rdx,%rcx
-       xorl    %eax,%eax
-.long  0x9066AAF3
-       leaq    -16(%rdi),%rdi
-       movl    %r10d,%eax
-       movq    %rdi,%rsi
-       movq    %r11,%rcx
-       xorq    %rdx,%rdx
-       jmp     L$cbc_enc_loop
+       movdqu  (%rbx),%xmm10
+       movdqu  (%rbp),%xmm8
 
-.p2align       4
-L$cbc_decrypt:
-       cmpq    $16,%rdx
-       jne     L$cbc_decrypt_bulk
+       testq   $1,%r8
+       jnz     L$ocb_enc_odd
 
+       bsfq    %r8,%r12
+       addq    $1,%r8
+       shlq    $4,%r12
+       movdqu  (%rbx,%r12,1),%xmm7
+       movdqu  (%rdi),%xmm2
+       leaq    16(%rdi),%rdi
 
+       call    __ocb_encrypt1
 
-       movdqu  (%rdi),%xmm2
-       movdqu  (%r8),%xmm3
-       movdqa  %xmm2,%xmm4
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-L$oop_dec1_16:
-.byte  102,15,56,222,209
-       decl    %r10d
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     L$oop_dec1_16
-.byte  102,15,56,223,209
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       movdqu  %xmm4,(%r8)
-       xorps   %xmm3,%xmm2
-       pxor    %xmm3,%xmm3
+       movdqa  %xmm7,%xmm15
        movups  %xmm2,(%rsi)
-       pxor    %xmm2,%xmm2
-       jmp     L$cbc_ret
-.p2align       4
-L$cbc_decrypt_bulk:
-       leaq    (%rsp),%rax
-       pushq   %rbp
-       subq    $16,%rsp
-       andq    $-16,%rsp
-       leaq    -8(%rax),%rbp
-       movups  (%r8),%xmm10
-       movl    %r10d,%eax
-       cmpq    $0x50,%rdx
-       jbe     L$cbc_dec_tail
+       leaq    16(%rsi),%rsi
+       subq    $1,%rdx
+       jz      L$ocb_enc_done
+
+L$ocb_enc_odd:
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       leaq    6(%r8),%r8
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+       shlq    $4,%r12
+       shlq    $4,%r13
+       shlq    $4,%r14
 
-       movups  (%rcx),%xmm0
+       subq    $6,%rdx
+       jc      L$ocb_enc_short
+       jmp     L$ocb_enc_grandloop
+
+.p2align       5
+L$ocb_enc_grandloop:
        movdqu  0(%rdi),%xmm2
        movdqu  16(%rdi),%xmm3
-       movdqa  %xmm2,%xmm11
        movdqu  32(%rdi),%xmm4
-       movdqa  %xmm3,%xmm12
        movdqu  48(%rdi),%xmm5
-       movdqa  %xmm4,%xmm13
        movdqu  64(%rdi),%xmm6
-       movdqa  %xmm5,%xmm14
        movdqu  80(%rdi),%xmm7
-       movdqa  %xmm6,%xmm15
-       movl    __gnutls_x86_cpuid_s+4(%rip),%r9d
-       cmpq    $0x70,%rdx
-       jbe     L$cbc_dec_six_or_seven
+       leaq    96(%rdi),%rdi
+
+       call    __ocb_encrypt6
+
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+       movups  %xmm6,64(%rsi)
+       movups  %xmm7,80(%rsi)
+       leaq    96(%rsi),%rsi
+       subq    $6,%rdx
+       jnc     L$ocb_enc_grandloop
+
+L$ocb_enc_short:
+       addq    $6,%rdx
+       jz      L$ocb_enc_done
+
+       movdqu  0(%rdi),%xmm2
+       cmpq    $2,%rdx
+       jb      L$ocb_enc_one
+       movdqu  16(%rdi),%xmm3
+       je      L$ocb_enc_two
+
+       movdqu  32(%rdi),%xmm4
+       cmpq    $4,%rdx
+       jb      L$ocb_enc_three
+       movdqu  48(%rdi),%xmm5
+       je      L$ocb_enc_four
+
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm7,%xmm7
+
+       call    __ocb_encrypt6
+
+       movdqa  %xmm14,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+       movups  %xmm6,64(%rsi)
+
+       jmp     L$ocb_enc_done
 
-       andl    $71303168,%r9d
-       subq    $0x50,%rdx
-       cmpl    $4194304,%r9d
-       je      L$cbc_dec_loop6_enter
-       subq    $0x20,%rdx
-       leaq    112(%rcx),%rcx
-       jmp     L$cbc_dec_loop8_enter
 .p2align       4
-L$cbc_dec_loop8:
-       movups  %xmm9,(%rsi)
-       leaq    16(%rsi),%rsi
-L$cbc_dec_loop8_enter:
-       movdqu  96(%rdi),%xmm8
-       pxor    %xmm0,%xmm2
-       movdqu  112(%rdi),%xmm9
-       pxor    %xmm0,%xmm3
-       movups  16-112(%rcx),%xmm1
-       pxor    %xmm0,%xmm4
-       xorq    %r11,%r11
-       cmpq    $0x70,%rdx
-       pxor    %xmm0,%xmm5
-       pxor    %xmm0,%xmm6
-       pxor    %xmm0,%xmm7
-       pxor    %xmm0,%xmm8
+L$ocb_enc_one:
+       movdqa  %xmm10,%xmm7
 
-.byte  102,15,56,222,209
-       pxor    %xmm0,%xmm9
-       movups  32-112(%rcx),%xmm0
-.byte  102,15,56,222,217
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-       setnc   %r11b
-       shlq    $7,%r11
-.byte  102,68,15,56,222,201
-       addq    %rdi,%r11
-       movups  48-112(%rcx),%xmm1
+       call    __ocb_encrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,0(%rsi)
+       jmp     L$ocb_enc_done
+
+.p2align       4
+L$ocb_enc_two:
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_encrypt4
+
+       movdqa  %xmm11,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+
+       jmp     L$ocb_enc_done
+
+.p2align       4
+L$ocb_enc_three:
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_encrypt4
+
+       movdqa  %xmm12,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+
+       jmp     L$ocb_enc_done
+
+.p2align       4
+L$ocb_enc_four:
+       call    __ocb_encrypt4
+
+       movdqa  %xmm13,%xmm15
+       movups  %xmm2,0(%rsi)
+       movups  %xmm3,16(%rsi)
+       movups  %xmm4,32(%rsi)
+       movups  %xmm5,48(%rsi)
+
+L$ocb_enc_done:
+       pxor    %xmm0,%xmm15
+       movdqu  %xmm8,(%rbp)
+       movdqu  %xmm15,(%r9)
+
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       leaq    40(%rsp),%rax
+
+       movq    -40(%rax),%r14
+
+       movq    -32(%rax),%r13
+
+       movq    -24(%rax),%r12
+
+       movq    -16(%rax),%rbp
+
+       movq    -8(%rax),%rbx
+
+       leaq    (%rax),%rsp
+
+L$ocb_enc_epilogue:
+       .byte   0xf3,0xc3
+
+
+
+
+.p2align       5
+__ocb_encrypt6:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       movdqa  %xmm10,%xmm14
+       pxor    %xmm15,%xmm10
+       movdqu  (%rbx,%r14,1),%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm2,%xmm8
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm3,%xmm8
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm4,%xmm8
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm14
+       pxor    %xmm5,%xmm8
+       pxor    %xmm13,%xmm5
+       pxor    %xmm14,%xmm15
+       pxor    %xmm6,%xmm8
+       pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm8
+       pxor    %xmm15,%xmm7
+       movups  32(%r11),%xmm0
+
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       addq    $6,%r8
+       pxor    %xmm9,%xmm10
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+.byte  102,15,56,220,241
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm14
+.byte  102,15,56,220,249
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm15
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  64(%r11),%xmm0
+       shlq    $4,%r12
+       shlq    $4,%r13
+       jmp     L$ocb_enc_loop6
+
+.p2align       5
+L$ocb_enc_loop6:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     L$ocb_enc_loop6
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  16(%r11),%xmm1
+       shlq    $4,%r14
+
+.byte  102,65,15,56,221,210
+       movdqu  (%rbx),%xmm10
+       movq    %r10,%rax
+.byte  102,65,15,56,221,219
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+.byte  102,65,15,56,221,246
+.byte  102,65,15,56,221,255
+       .byte   0xf3,0xc3
+
+
+
+.p2align       5
+__ocb_encrypt4:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       pxor    %xmm15,%xmm10
+       pxor    %xmm10,%xmm11
+       pxor    %xmm2,%xmm8
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm3,%xmm8
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm4,%xmm8
+       pxor    %xmm12,%xmm4
+       pxor    %xmm5,%xmm8
+       pxor    %xmm13,%xmm5
+       movups  32(%r11),%xmm0
+
+       pxor    %xmm9,%xmm10
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+       pxor    %xmm9,%xmm13
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  48(%r11),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  64(%r11),%xmm0
+       jmp     L$ocb_enc_loop4
+
+.p2align       5
+L$ocb_enc_loop4:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     L$ocb_enc_loop4
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,65,15,56,221,210
+.byte  102,65,15,56,221,219
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+       .byte   0xf3,0xc3
+
+
+
+.p2align       5
+__ocb_encrypt1:
+       pxor    %xmm15,%xmm7
+       pxor    %xmm9,%xmm7
+       pxor    %xmm2,%xmm8
+       pxor    %xmm7,%xmm2
+       movups  32(%r11),%xmm0
+
+.byte  102,15,56,220,209
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm7
+
+.byte  102,15,56,220,208
+       movups  64(%r11),%xmm0
+       jmp     L$ocb_enc_loop1
+
+.p2align       5
+L$ocb_enc_loop1:
+.byte  102,15,56,220,209
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,220,208
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     L$ocb_enc_loop1
+
+.byte  102,15,56,220,209
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,15,56,221,215
+       .byte   0xf3,0xc3
+
+
+.globl _aesni_ocb_decrypt
+
+.p2align       5
+_aesni_ocb_decrypt:
+
+       leaq    (%rsp),%rax
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       movq    8(%rax),%rbx
+       movq    8+8(%rax),%rbp
+
+       movl    240(%rcx),%r10d
+       movq    %rcx,%r11
+       shll    $4,%r10d
+       movups  (%rcx),%xmm9
+       movups  16(%rcx,%r10,1),%xmm1
+
+       movdqu  (%r9),%xmm15
+       pxor    %xmm1,%xmm9
+       pxor    %xmm1,%xmm15
+
+       movl    $16+32,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       movups  16(%r11),%xmm1
+       subq    %r10,%rax
+       movq    %rax,%r10
+
+       movdqu  (%rbx),%xmm10
+       movdqu  (%rbp),%xmm8
+
+       testq   $1,%r8
+       jnz     L$ocb_dec_odd
+
+       bsfq    %r8,%r12
+       addq    $1,%r8
+       shlq    $4,%r12
+       movdqu  (%rbx,%r12,1),%xmm7
+       movdqu  (%rdi),%xmm2
+       leaq    16(%rdi),%rdi
+
+       call    __ocb_decrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm8
+       leaq    16(%rsi),%rsi
+       subq    $1,%rdx
+       jz      L$ocb_dec_done
+
+L$ocb_dec_odd:
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       leaq    6(%r8),%r8
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+       shlq    $4,%r12
+       shlq    $4,%r13
+       shlq    $4,%r14
+
+       subq    $6,%rdx
+       jc      L$ocb_dec_short
+       jmp     L$ocb_dec_grandloop
+
+.p2align       5
+L$ocb_dec_grandloop:
+       movdqu  0(%rdi),%xmm2
+       movdqu  16(%rdi),%xmm3
+       movdqu  32(%rdi),%xmm4
+       movdqu  48(%rdi),%xmm5
+       movdqu  64(%rdi),%xmm6
+       movdqu  80(%rdi),%xmm7
+       leaq    96(%rdi),%rdi
+
+       call    __ocb_decrypt6
+
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+       movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm8
+       movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm8
+       leaq    96(%rsi),%rsi
+       subq    $6,%rdx
+       jnc     L$ocb_dec_grandloop
+
+L$ocb_dec_short:
+       addq    $6,%rdx
+       jz      L$ocb_dec_done
+
+       movdqu  0(%rdi),%xmm2
+       cmpq    $2,%rdx
+       jb      L$ocb_dec_one
+       movdqu  16(%rdi),%xmm3
+       je      L$ocb_dec_two
+
+       movdqu  32(%rdi),%xmm4
+       cmpq    $4,%rdx
+       jb      L$ocb_dec_three
+       movdqu  48(%rdi),%xmm5
+       je      L$ocb_dec_four
+
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm7,%xmm7
+
+       call    __ocb_decrypt6
+
+       movdqa  %xmm14,%xmm15
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+       movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm8
+
+       jmp     L$ocb_dec_done
+
+.p2align       4
+L$ocb_dec_one:
+       movdqa  %xmm10,%xmm7
+
+       call    __ocb_decrypt1
+
+       movdqa  %xmm7,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       jmp     L$ocb_dec_done
+
+.p2align       4
+L$ocb_dec_two:
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_decrypt4
+
+       movdqa  %xmm11,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       xorps   %xmm3,%xmm8
+
+       jmp     L$ocb_dec_done
+
+.p2align       4
+L$ocb_dec_three:
+       pxor    %xmm5,%xmm5
+
+       call    __ocb_decrypt4
+
+       movdqa  %xmm12,%xmm15
+       movups  %xmm2,0(%rsi)
+       xorps   %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       xorps   %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       xorps   %xmm4,%xmm8
+
+       jmp     L$ocb_dec_done
+
+.p2align       4
+L$ocb_dec_four:
+       call    __ocb_decrypt4
+
+       movdqa  %xmm13,%xmm15
+       movups  %xmm2,0(%rsi)
+       pxor    %xmm2,%xmm8
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm8
+       movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm8
+       movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm8
+
+L$ocb_dec_done:
+       pxor    %xmm0,%xmm15
+       movdqu  %xmm8,(%rbp)
+       movdqu  %xmm15,(%r9)
+
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       leaq    40(%rsp),%rax
+
+       movq    -40(%rax),%r14
+
+       movq    -32(%rax),%r13
+
+       movq    -24(%rax),%r12
+
+       movq    -16(%rax),%rbp
+
+       movq    -8(%rax),%rbx
+
+       leaq    (%rax),%rsp
+
+L$ocb_dec_epilogue:
+       .byte   0xf3,0xc3
+
+
+
+
+.p2align       5
+__ocb_decrypt6:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       movdqa  %xmm10,%xmm14
+       pxor    %xmm15,%xmm10
+       movdqu  (%rbx,%r14,1),%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm14
+       pxor    %xmm13,%xmm5
+       pxor    %xmm14,%xmm15
+       pxor    %xmm14,%xmm6
+       pxor    %xmm15,%xmm7
+       movups  32(%r11),%xmm0
+
+       leaq    1(%r8),%r12
+       leaq    3(%r8),%r13
+       leaq    5(%r8),%r14
+       addq    $6,%r8
+       pxor    %xmm9,%xmm10
+       bsfq    %r12,%r12
+       bsfq    %r13,%r13
+       bsfq    %r14,%r14
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+.byte  102,15,56,222,241
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm14
+.byte  102,15,56,222,249
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm15
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+       movups  64(%r11),%xmm0
+       shlq    $4,%r12
+       shlq    $4,%r13
+       jmp     L$ocb_dec_loop6
+
+.p2align       5
+L$ocb_dec_loop6:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     L$ocb_dec_loop6
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+       movups  16(%r11),%xmm1
+       shlq    $4,%r14
+
+.byte  102,65,15,56,223,210
+       movdqu  (%rbx),%xmm10
+       movq    %r10,%rax
+.byte  102,65,15,56,223,219
+.byte  102,65,15,56,223,228
+.byte  102,65,15,56,223,237
+.byte  102,65,15,56,223,246
+.byte  102,65,15,56,223,255
+       .byte   0xf3,0xc3
+
+
+
+.p2align       5
+__ocb_decrypt4:
+       pxor    %xmm9,%xmm15
+       movdqu  (%rbx,%r12,1),%xmm11
+       movdqa  %xmm10,%xmm12
+       movdqu  (%rbx,%r13,1),%xmm13
+       pxor    %xmm15,%xmm10
+       pxor    %xmm10,%xmm11
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm12
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm13
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm5
+       movups  32(%r11),%xmm0
+
+       pxor    %xmm9,%xmm10
+       pxor    %xmm9,%xmm11
+       pxor    %xmm9,%xmm12
+       pxor    %xmm9,%xmm13
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  48(%r11),%xmm1
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  64(%r11),%xmm0
+       jmp     L$ocb_dec_loop4
+
+.p2align       5
+L$ocb_dec_loop4:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     L$ocb_dec_loop4
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,65,15,56,223,210
+.byte  102,65,15,56,223,219
+.byte  102,65,15,56,223,228
+.byte  102,65,15,56,223,237
+       .byte   0xf3,0xc3
+
+
+
+.p2align       5
+__ocb_decrypt1:
+       pxor    %xmm15,%xmm7
+       pxor    %xmm9,%xmm7
+       pxor    %xmm7,%xmm2
+       movups  32(%r11),%xmm0
+
+.byte  102,15,56,222,209
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm7
+
+.byte  102,15,56,222,208
+       movups  64(%r11),%xmm0
+       jmp     L$ocb_dec_loop1
+
+.p2align       5
+L$ocb_dec_loop1:
+.byte  102,15,56,222,209
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
+.byte  102,15,56,222,208
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     L$ocb_dec_loop1
+
+.byte  102,15,56,222,209
+       movups  16(%r11),%xmm1
+       movq    %r10,%rax
+
+.byte  102,15,56,223,215
+       .byte   0xf3,0xc3
+
+.globl _aesni_cbc_encrypt
+
+.p2align       4
+_aesni_cbc_encrypt:
+
+       testq   %rdx,%rdx
+       jz      L$cbc_ret
+
+       movl    240(%rcx),%r10d
+       movq    %rcx,%r11
+       testl   %r9d,%r9d
+       jz      L$cbc_decrypt
+
+       movups  (%r8),%xmm2
+       movl    %r10d,%eax
+       cmpq    $16,%rdx
+       jb      L$cbc_enc_tail
+       subq    $16,%rdx
+       jmp     L$cbc_enc_loop
+.p2align       4
+L$cbc_enc_loop:
+       movups  (%rdi),%xmm3
+       leaq    16(%rdi),%rdi
+
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       xorps   %xmm0,%xmm3
+       leaq    32(%rcx),%rcx
+       xorps   %xmm3,%xmm2
+L$oop_enc1_15:
+.byte  102,15,56,220,209
+       decl    %eax
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     L$oop_enc1_15
+.byte  102,15,56,221,209
+       movl    %r10d,%eax
+       movq    %r11,%rcx
+       movups  %xmm2,0(%rsi)
+       leaq    16(%rsi),%rsi
+       subq    $16,%rdx
+       jnc     L$cbc_enc_loop
+       addq    $16,%rdx
+       jnz     L$cbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       jmp     L$cbc_ret
+
+L$cbc_enc_tail:
+       movq    %rdx,%rcx
+       xchgq   %rdi,%rsi
+.long  0x9066A4F3
+       movl    $16,%ecx
+       subq    %rdx,%rcx
+       xorl    %eax,%eax
+.long  0x9066AAF3
+       leaq    -16(%rdi),%rdi
+       movl    %r10d,%eax
+       movq    %rdi,%rsi
+       movq    %r11,%rcx
+       xorq    %rdx,%rdx
+       jmp     L$cbc_enc_loop
+
+.p2align       4
+L$cbc_decrypt:
+       cmpq    $16,%rdx
+       jne     L$cbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+L$oop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     L$oop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     L$cbc_ret
+.p2align       4
+L$cbc_decrypt_bulk:
+       leaq    (%rsp),%r11
+
+       pushq   %rbp
+
+       subq    $16,%rsp
+       andq    $-16,%rsp
+       movq    %rcx,%rbp
+       movups  (%r8),%xmm10
+       movl    %r10d,%eax
+       cmpq    $0x50,%rdx
+       jbe     L$cbc_dec_tail
+
+       movups  (%rcx),%xmm0
+       movdqu  0(%rdi),%xmm2
+       movdqu  16(%rdi),%xmm3
+       movdqa  %xmm2,%xmm11
+       movdqu  32(%rdi),%xmm4
+       movdqa  %xmm3,%xmm12
+       movdqu  48(%rdi),%xmm5
+       movdqa  %xmm4,%xmm13
+       movdqu  64(%rdi),%xmm6
+       movdqa  %xmm5,%xmm14
+       movdqu  80(%rdi),%xmm7
+       movdqa  %xmm6,%xmm15
+       movl    __gnutls_x86_cpuid_s+4(%rip),%r9d
+       cmpq    $0x70,%rdx
+       jbe     L$cbc_dec_six_or_seven
+
+       andl    $71303168,%r9d
+       subq    $0x50,%rdx
+       cmpl    $4194304,%r9d
+       je      L$cbc_dec_loop6_enter
+       subq    $0x20,%rdx
+       leaq    112(%rcx),%rcx
+       jmp     L$cbc_dec_loop8_enter
+.p2align       4
+L$cbc_dec_loop8:
+       movups  %xmm9,(%rsi)
+       leaq    16(%rsi),%rsi
+L$cbc_dec_loop8_enter:
+       movdqu  96(%rdi),%xmm8
+       pxor    %xmm0,%xmm2
+       movdqu  112(%rdi),%xmm9
+       pxor    %xmm0,%xmm3
+       movups  16-112(%rcx),%xmm1
+       pxor    %xmm0,%xmm4
+       movq    $-1,%rbp
+       cmpq    $0x70,%rdx
+       pxor    %xmm0,%xmm5
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+       pxor    %xmm0,%xmm8
+
+.byte  102,15,56,222,209
+       pxor    %xmm0,%xmm9
+       movups  32-112(%rcx),%xmm0
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+.byte  102,15,56,222,241
+.byte  102,15,56,222,249
+.byte  102,68,15,56,222,193
+       adcq    $0,%rbp
+       andq    $128,%rbp
+.byte  102,68,15,56,222,201
+       addq    %rdi,%rbp
+       movups  48-112(%rcx),%xmm1
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
 .byte  102,15,56,222,224
@@ -2867,18 +3753,18 @@ L$cbc_dec_done:
        movdqu  112(%rdi),%xmm0
 .byte  102,65,15,56,223,228
        leaq    128(%rdi),%rdi
-       movdqu  0(%r11),%xmm11
+       movdqu  0(%rbp),%xmm11
 .byte  102,65,15,56,223,237
 .byte  102,65,15,56,223,246
-       movdqu  16(%r11),%xmm12
-       movdqu  32(%r11),%xmm13
+       movdqu  16(%rbp),%xmm12
+       movdqu  32(%rbp),%xmm13
 .byte  102,65,15,56,223,255
 .byte  102,68,15,56,223,193
-       movdqu  48(%r11),%xmm14
-       movdqu  64(%r11),%xmm15
+       movdqu  48(%rbp),%xmm14
+       movdqu  64(%rbp),%xmm15
 .byte  102,69,15,56,223,202
        movdqa  %xmm0,%xmm10
-       movdqu  80(%r11),%xmm1
+       movdqu  80(%rbp),%xmm1
        movups  -112(%rcx),%xmm0
 
        movups  %xmm2,(%rsi)
@@ -2997,7 +3883,7 @@ L$cbc_dec_loop6_enter:
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
        pxor    %xmm14,%xmm6
-       movq    %r11,%rcx
+       movq    %rbp,%rcx
        movdqu  %xmm5,48(%rsi)
        pxor    %xmm15,%xmm7
        movl    %r10d,%eax
@@ -3150,16 +4036,21 @@ L$cbc_dec_tail_partial:
 L$cbc_dec_ret:
        xorps   %xmm0,%xmm0
        pxor    %xmm1,%xmm1
-       leaq    (%rbp),%rsp
-       popq    %rbp
+       movq    -8(%r11),%rbp
+
+       leaq    (%r11),%rsp
+
 L$cbc_ret:
        .byte   0xf3,0xc3
 
+
 .globl _aesni_set_decrypt_key
 
 .p2align       4
 _aesni_set_decrypt_key:
+
 .byte  0x48,0x83,0xEC,0x08
+
        call    __aesni_set_encrypt_key
        shll    $4,%esi
        testl   %eax,%eax
@@ -3192,7 +4083,9 @@ L$dec_key_inverse:
        pxor    %xmm0,%xmm0
 L$dec_key_ret:
        addq    $8,%rsp
+
        .byte   0xf3,0xc3
+
 L$SEH_end_set_decrypt_key:
 
 .globl _aesni_set_encrypt_key
@@ -3200,7 +4093,9 @@ L$SEH_end_set_decrypt_key:
 .p2align       4
 _aesni_set_encrypt_key:
 __aesni_set_encrypt_key:
+
 .byte  0x48,0x83,0xEC,0x08
+
        movq    $-1,%rax
        testq   %rdi,%rdi
        jz      L$enc_key_ret
@@ -3493,7 +4388,9 @@ L$enc_key_ret:
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        addq    $8,%rsp
+
        .byte   0xf3,0xc3
+
 L$SEH_end_set_encrypt_key:
 
 .p2align       4
index a8371ab56e81fca5f897d7238e2d7d53d7bb10aa..bd8e443fa9e386a2adb76baee8c9db28f8fc3642 100644 (file)
@@ -21,7 +21,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "devel/perlasm/cpuid-x86.s"
 .text
 .globl _gnutls_cpuid
 .align 4
index 8fe772fd35ba6d840de54fd9ba01522c58862bf1..5fd3216755b0d823b774164f4373ae0e6be3f521 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 
 .p2align       4
 _gcm_gmult_4bit:
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       subq    $280,%rsp
+
 L$gmult_prologue:
 
        movzbq  15(%rdi),%r8
@@ -123,22 +135,35 @@ L$break1:
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)
 
-       movq    16(%rsp),%rbx
-       leaq    24(%rsp),%rsp
+       leaq    280+48(%rsp),%rsi
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
 L$gmult_epilogue:
        .byte   0xf3,0xc3
 
+
 .globl _gcm_ghash_4bit
 
 .p2align       4
 _gcm_ghash_4bit:
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
+
        subq    $280,%rsp
+
 L$ghash_prologue:
        movq    %rdx,%r14
        movq    %rcx,%r15
@@ -683,21 +708,31 @@ L$outer_loop:
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)
 
-       leaq    280(%rsp),%rsi
-       movq    0(%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
+       leaq    280+48(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    0(%rsi),%rsp
+
 L$ghash_epilogue:
        .byte   0xf3,0xc3
 
+
 .globl _gcm_init_clmul
 
 .p2align       4
 _gcm_init_clmul:
+
 L$_init_clmul:
        movdqu  (%rsi),%xmm2
        pshufd  $78,%xmm2,%xmm2
@@ -850,10 +885,12 @@ L$_init_clmul:
        movdqu  %xmm4,80(%rdi)
        .byte   0xf3,0xc3
 
+
 .globl _gcm_gmult_clmul
 
 .p2align       4
 _gcm_gmult_clmul:
+
 L$_gmult_clmul:
        movdqu  (%rdi),%xmm0
        movdqa  L$bswap_mask(%rip),%xmm5
@@ -901,10 +938,12 @@ L$_gmult_clmul:
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3
 
+
 .globl _gcm_ghash_clmul
 
 .p2align       5
 _gcm_ghash_clmul:
+
 L$_ghash_clmul:
        movdqa  L$bswap_mask(%rip),%xmm10
 
@@ -1284,10 +1323,12 @@ L$done:
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3
 
+
 .globl _gcm_init_avx
 
 .p2align       5
 _gcm_init_avx:
+
        vzeroupper
 
        vmovdqu (%rsi),%xmm2
@@ -1391,16 +1432,20 @@ L$init_start_avx:
        vzeroupper
        .byte   0xf3,0xc3
 
+
 .globl _gcm_gmult_avx
 
 .p2align       5
 _gcm_gmult_avx:
+
        jmp     L$_gmult_clmul
 
+
 .globl _gcm_ghash_avx
 
 .p2align       5
 _gcm_ghash_avx:
+
        vzeroupper
 
        vmovdqu (%rdi),%xmm10
@@ -1773,6 +1818,7 @@ L$tail_no_xor_avx:
        vzeroupper
        .byte   0xf3,0xc3
 
+
 .p2align       6
 L$bswap_mask:
 .byte  15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
index 8e01010ce3db7cd58084624768a9916598e2d68c..985d4af8dbb4bf83f95ce0bde34107ae87770c8a 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha1-586.s"
 .text
 .globl _sha1_block_data_order
 .align 4
index 79c10de2ed5587e40e2f4f752d9c0a6e2c4ad176..a576acc25fb030c7002c71ddf3c66910ef84fa47 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 
 .p2align       4
 _sha1_block_data_order:
+
        movl    __gnutls_x86_cpuid_s+0(%rip),%r9d
        movl    __gnutls_x86_cpuid_s+4(%rip),%r8d
+       movl    __gnutls_x86_cpuid_s+8(%rip),%r10d
        testl   $512,%r8d
        jz      L$ialu
+       testl   $536870912,%r10d
+       jnz     _shaext_shortcut
+       andl    $296,%r10d
+       cmpl    $296,%r10d
+       je      _avx2_shortcut
+       andl    $268435456,%r8d
+       andl    $1073741824,%r9d
+       orl     %r9d,%r8d
+       cmpl    $1342177280,%r8d
+       je      _avx_shortcut
        jmp     _ssse3_shortcut
 
 .p2align       4
 L$ialu:
+       movq    %rsp,%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
-       movq    %rsp,%r11
+
+       pushq   %r14
+
        movq    %rdi,%r8
        subq    $72,%rsp
        movq    %rsi,%r9
        andq    $-64,%rsp
        movq    %rdx,%r10
-       movq    %r11,64(%rsp)
+       movq    %rax,64(%rsp)
+
 L$prologue:
 
        movl    0(%r8),%esi
@@ -76,1230 +96,1168 @@ L$prologue:
 L$loop:
        movl    0(%r9),%edx
        bswapl  %edx
-       movl    %edx,0(%rsp)
-       movl    %r11d,%eax
        movl    4(%r9),%ebp
+       movl    %r12d,%eax
+       movl    %edx,0(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
        bswapl  %ebp
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %ebp,4(%rsp)
+       leal    1518500249(%rdx,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
-       movl    8(%r9),%edx
+       movl    8(%r9),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,4(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %edx,8(%rsp)
+       leal    1518500249(%rbp,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
-       movl    12(%r9),%ebp
+       movl    12(%r9),%edx
+       movl    %edi,%eax
+       movl    %r14d,8(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %ebp,12(%rsp)
+       leal    1518500249(%r14,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    16(%r9),%edx
+       movl    16(%r9),%ebp
+       movl    %esi,%eax
+       movl    %edx,12(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %edx
+       bswapl  %ebp
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %edx,16(%rsp)
+       leal    1518500249(%rdx,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    20(%r9),%ebp
+       movl    20(%r9),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,16(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %ebp
+       bswapl  %r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %ebp,20(%rsp)
+       leal    1518500249(%rbp,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    %r11d,%eax
        movl    24(%r9),%edx
+       movl    %r12d,%eax
+       movl    %r14d,20(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
        bswapl  %edx
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %edx,24(%rsp)
+       leal    1518500249(%r14,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
        movl    28(%r9),%ebp
+       movl    %r11d,%eax
+       movl    %edx,24(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
        bswapl  %ebp
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %ebp,28(%rsp)
+       leal    1518500249(%rdx,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
-       movl    32(%r9),%edx
+       movl    32(%r9),%r14d
+       movl    %edi,%eax
+       movl    %ebp,28(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %edx,32(%rsp)
+       leal    1518500249(%rbp,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    36(%r9),%ebp
+       movl    36(%r9),%edx
+       movl    %esi,%eax
+       movl    %r14d,32(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %ebp,36(%rsp)
+       leal    1518500249(%r14,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    40(%r9),%edx
+       movl    40(%r9),%ebp
+       movl    %r13d,%eax
+       movl    %edx,36(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %edx
+       bswapl  %ebp
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %edx,40(%rsp)
+       leal    1518500249(%rdx,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    %r11d,%eax
-       movl    44(%r9),%ebp
+       movl    44(%r9),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,40(%rsp)
        movl    %esi,%ecx
-       xorl    %r12d,%eax
-       bswapl  %ebp
+       bswapl  %r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r13,1),%r13d
        andl    %edi,%eax
-       movl    %ebp,44(%rsp)
+       leal    1518500249(%rbp,%r13,1),%r13d
        addl    %ecx,%r13d
        xorl    %r12d,%eax
        roll    $30,%edi
        addl    %eax,%r13d
-       movl    %edi,%eax
        movl    48(%r9),%edx
+       movl    %r11d,%eax
+       movl    %r14d,44(%rsp)
        movl    %r13d,%ecx
-       xorl    %r11d,%eax
        bswapl  %edx
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%r12,1),%r12d
        andl    %esi,%eax
-       movl    %edx,48(%rsp)
+       leal    1518500249(%r14,%r12,1),%r12d
        addl    %ecx,%r12d
        xorl    %r11d,%eax
        roll    $30,%esi
        addl    %eax,%r12d
-       movl    %esi,%eax
        movl    52(%r9),%ebp
+       movl    %edi,%eax
+       movl    %edx,48(%rsp)
        movl    %r12d,%ecx
-       xorl    %edi,%eax
        bswapl  %ebp
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%r11,1),%r11d
        andl    %r13d,%eax
-       movl    %ebp,52(%rsp)
+       leal    1518500249(%rdx,%r11,1),%r11d
        addl    %ecx,%r11d
        xorl    %edi,%eax
        roll    $30,%r13d
        addl    %eax,%r11d
-       movl    %r13d,%eax
-       movl    56(%r9),%edx
+       movl    56(%r9),%r14d
+       movl    %esi,%eax
+       movl    %ebp,52(%rsp)
        movl    %r11d,%ecx
-       xorl    %esi,%eax
-       bswapl  %edx
+       bswapl  %r14d
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rbp,%rdi,1),%edi
        andl    %r12d,%eax
-       movl    %edx,56(%rsp)
+       leal    1518500249(%rbp,%rdi,1),%edi
        addl    %ecx,%edi
        xorl    %esi,%eax
        roll    $30,%r12d
        addl    %eax,%edi
-       movl    %r12d,%eax
-       movl    60(%r9),%ebp
+       movl    60(%r9),%edx
+       movl    %r13d,%eax
+       movl    %r14d,56(%rsp)
        movl    %edi,%ecx
-       xorl    %r13d,%eax
-       bswapl  %ebp
+       bswapl  %edx
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1518500249(%rdx,%rsi,1),%esi
        andl    %r11d,%eax
-       movl    %ebp,60(%rsp)
+       leal    1518500249(%r14,%rsi,1),%esi
        addl    %ecx,%esi
        xorl    %r13d,%eax
        roll    $30,%r11d
        addl    %eax,%esi
-       movl    0(%rsp),%edx
-       movl    %r11d,%eax
+       xorl    0(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,60(%rsp)
        movl    %esi,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    8(%rsp),%ebp
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       xorl    32(%rsp),%edx
+       xorl    32(%rsp),%ebp
        andl    %edi,%eax
-       leal    1518500249(%rbp,%r13,1),%r13d
-       xorl    52(%rsp),%edx
+       leal    1518500249(%rdx,%r13,1),%r13d
+       roll    $30,%edi
        xorl    %r12d,%eax
-       roll    $1,%edx
        addl    %ecx,%r13d
-       roll    $30,%edi
-       movl    %edx,0(%rsp)
+       roll    $1,%ebp
        addl    %eax,%r13d
-       movl    4(%rsp),%ebp
-       movl    %edi,%eax
+       xorl    4(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,0(%rsp)
        movl    %r13d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    12(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
-       xorl    36(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        andl    %esi,%eax
-       leal    1518500249(%rdx,%r12,1),%r12d
-       xorl    56(%rsp),%ebp
+       leal    1518500249(%rbp,%r12,1),%r12d
+       roll    $30,%esi
        xorl    %r11d,%eax
-       roll    $1,%ebp
        addl    %ecx,%r12d
-       roll    $30,%esi
-       movl    %ebp,4(%rsp)
+       roll    $1,%r14d
        addl    %eax,%r12d
-       movl    8(%rsp),%edx
-       movl    %esi,%eax
+       xorl    8(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,4(%rsp)
        movl    %r12d,%ecx
        xorl    16(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
        xorl    40(%rsp),%edx
        andl    %r13d,%eax
-       leal    1518500249(%rbp,%r11,1),%r11d
-       xorl    60(%rsp),%edx
+       leal    1518500249(%r14,%r11,1),%r11d
+       roll    $30,%r13d
        xorl    %edi,%eax
-       roll    $1,%edx
        addl    %ecx,%r11d
-       roll    $30,%r13d
-       movl    %edx,8(%rsp)
+       roll    $1,%edx
        addl    %eax,%r11d
-       movl    12(%rsp),%ebp
-       movl    %r13d,%eax
+       xorl    12(%rsp),%ebp
+       movl    %esi,%eax
+       movl    %edx,8(%rsp)
        movl    %r11d,%ecx
        xorl    20(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
        xorl    44(%rsp),%ebp
        andl    %r12d,%eax
        leal    1518500249(%rdx,%rdi,1),%edi
-       xorl    0(%rsp),%ebp
+       roll    $30,%r12d
        xorl    %esi,%eax
-       roll    $1,%ebp
        addl    %ecx,%edi
-       roll    $30,%r12d
-       movl    %ebp,12(%rsp)
+       roll    $1,%ebp
        addl    %eax,%edi
-       movl    16(%rsp),%edx
-       movl    %r12d,%eax
+       xorl    16(%rsp),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,12(%rsp)
        movl    %edi,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    24(%rsp),%r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       xorl    48(%rsp),%edx
+       xorl    48(%rsp),%r14d
        andl    %r11d,%eax
        leal    1518500249(%rbp,%rsi,1),%esi
-       xorl    4(%rsp),%edx
+       roll    $30,%r11d
        xorl    %r13d,%eax
-       roll    $1,%edx
        addl    %ecx,%esi
-       roll    $30,%r11d
-       movl    %edx,16(%rsp)
+       roll    $1,%r14d
        addl    %eax,%esi
-       movl    20(%rsp),%ebp
-       movl    %r11d,%eax
+       xorl    20(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,16(%rsp)
        movl    %esi,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r13,1),%r13d
-       xorl    52(%rsp),%ebp
+       xorl    28(%rsp),%edx
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    52(%rsp),%edx
+       leal    1859775393(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    8(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%ebp
-       movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %edi,%eax
+       roll    $1,%edx
+       xorl    24(%rsp),%ebp
+       movl    %esi,%eax
+       movl    %edx,20(%rsp)
        movl    %r13d,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%r12,1),%r12d
-       xorl    56(%rsp),%edx
+       xorl    32(%rsp),%ebp
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    56(%rsp),%ebp
+       leal    1859775393(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    12(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%ebp
+       xorl    28(%rsp),%r14d
+       movl    %r13d,%eax
+       movl    %ebp,24(%rsp)
        movl    %r12d,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r11,1),%r11d
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%r14d
+       leal    1859775393(%rbp,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    16(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    32(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,28(%rsp)
        movl    %r11d,%ecx
        xorl    40(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%rdi,1),%edi
        xorl    0(%rsp),%edx
-       xorl    %esi,%eax
+       leal    1859775393(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    20(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
        roll    $1,%edx
+       xorl    36(%rsp),%ebp
+       movl    %r11d,%eax
        movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %r12d,%eax
        movl    %edi,%ecx
        xorl    44(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%rsi,1),%esi
        xorl    4(%rsp),%ebp
-       xorl    %r13d,%eax
+       leal    1859775393(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    24(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
        roll    $1,%ebp
+       xorl    40(%rsp),%r14d
+       movl    %edi,%eax
        movl    %ebp,36(%rsp)
-       movl    40(%rsp),%edx
-       movl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    48(%rsp),%r14d
+       xorl    %r12d,%eax
        roll    $5,%ecx
+       xorl    8(%rsp),%r14d
        leal    1859775393(%rbp,%r13,1),%r13d
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    28(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%edx
-       movl    %edx,40(%rsp)
-       movl    44(%rsp),%ebp
-       movl    %edi,%eax
+       roll    $1,%r14d
+       xorl    44(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,40(%rsp)
        movl    %r13d,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r12,1),%r12d
-       xorl    12(%rsp),%ebp
+       xorl    52(%rsp),%edx
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    12(%rsp),%edx
+       leal    1859775393(%r14,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    32(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%ebp
-       movl    %ebp,44(%rsp)
-       movl    48(%rsp),%edx
-       movl    %esi,%eax
+       roll    $1,%edx
+       xorl    48(%rsp),%ebp
+       movl    %r13d,%eax
+       movl    %edx,44(%rsp)
        movl    %r12d,%ecx
-       xorl    56(%rsp),%edx
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%r11,1),%r11d
-       xorl    16(%rsp),%edx
+       xorl    56(%rsp),%ebp
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    16(%rsp),%ebp
+       leal    1859775393(%rdx,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    36(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,48(%rsp)
-       movl    52(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%ebp
+       xorl    52(%rsp),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,48(%rsp)
        movl    %r11d,%ecx
-       xorl    60(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rdi,1),%edi
-       xorl    20(%rsp),%ebp
+       xorl    60(%rsp),%r14d
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    20(%rsp),%r14d
+       leal    1859775393(%rbp,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    40(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,52(%rsp)
-       movl    56(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%r14d
+       xorl    56(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,52(%rsp)
        movl    %edi,%ecx
        xorl    0(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%rsi,1),%esi
        xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       leal    1859775393(%r14,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    44(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
        roll    $1,%edx
+       xorl    60(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,56(%rsp)
-       movl    60(%rsp),%ebp
-       movl    %r11d,%eax
        movl    %esi,%ecx
        xorl    4(%rsp),%ebp
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%r13,1),%r13d
        xorl    28(%rsp),%ebp
-       xorl    %r12d,%eax
+       leal    1859775393(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    48(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%ebp
+       xorl    0(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,60(%rsp)
-       movl    0(%rsp),%edx
-       movl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    8(%rsp),%r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
+       xorl    32(%rsp),%r14d
        leal    1859775393(%rbp,%r12,1),%r12d
-       xorl    32(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    52(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,0(%rsp)
-       movl    4(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    4(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,0(%rsp)
        movl    %r12d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%r11,1),%r11d
-       xorl    36(%rsp),%ebp
+       xorl    12(%rsp),%edx
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    36(%rsp),%edx
+       leal    1859775393(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    56(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,4(%rsp)
-       movl    8(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%edx
+       xorl    8(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,4(%rsp)
        movl    %r11d,%ecx
-       xorl    16(%rsp),%edx
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%rdi,1),%edi
-       xorl    40(%rsp),%edx
+       xorl    16(%rsp),%ebp
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    40(%rsp),%ebp
+       leal    1859775393(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    60(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,8(%rsp)
-       movl    12(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    12(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,8(%rsp)
        movl    %edi,%ecx
-       xorl    20(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rsi,1),%esi
-       xorl    44(%rsp),%ebp
+       xorl    20(%rsp),%r14d
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    44(%rsp),%r14d
+       leal    1859775393(%rbp,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    0(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,12(%rsp)
-       movl    16(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%r14d
+       xorl    16(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,12(%rsp)
        movl    %esi,%ecx
        xorl    24(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rbp,%r13,1),%r13d
        xorl    48(%rsp),%edx
-       xorl    %r12d,%eax
+       leal    1859775393(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    4(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%edx
+       xorl    20(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,16(%rsp)
-       movl    20(%rsp),%ebp
-       movl    %edi,%eax
        movl    %r13d,%ecx
        xorl    28(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    1859775393(%rdx,%r12,1),%r12d
        xorl    52(%rsp),%ebp
-       xorl    %r11d,%eax
+       leal    1859775393(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    8(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%ebp
+       xorl    24(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    32(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
+       xorl    56(%rsp),%r14d
        leal    1859775393(%rbp,%r11,1),%r11d
-       xorl    56(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    12(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    28(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,24(%rsp)
        movl    %r11d,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rdx,%rdi,1),%edi
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%edx
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%edx
+       leal    1859775393(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    16(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%edx
+       xorl    32(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,28(%rsp)
        movl    %edi,%ecx
-       xorl    40(%rsp),%edx
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    1859775393(%rbp,%rsi,1),%esi
-       xorl    0(%rsp),%edx
+       xorl    40(%rsp),%ebp
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    0(%rsp),%ebp
+       leal    1859775393(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    20(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%edx
-       movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    44(%rsp),%ebp
-       andl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    36(%rsp),%r14d
+       movl    %r12d,%eax
+       movl    %ebp,32(%rsp)
+       movl    %r12d,%ebx
+       xorl    44(%rsp),%r14d
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    4(%rsp),%r14d
+       leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    24(%rsp),%ebp
        addl    %eax,%r13d
+       roll    $1,%r14d
        andl    %edi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %ebp,36(%rsp)
        addl    %ecx,%r13d
-       movl    40(%rsp),%edx
-       movl    %edi,%eax
-       movl    %edi,%ebx
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    40(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,36(%rsp)
+       movl    %r11d,%ebx
        xorl    48(%rsp),%edx
-       andl    %r11d,%eax
+       andl    %edi,%eax
        movl    %r13d,%ecx
        xorl    8(%rsp),%edx
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rbp,%r12,1),%r12d
+       leal    -1894007588(%r14,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    28(%rsp),%edx
        addl    %eax,%r12d
-       andl    %esi,%ebx
        roll    $1,%edx
-       addl    %ebx,%r12d
+       andl    %esi,%ebx
+       addl    %ecx,%r12d
        roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    44(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,40(%rsp)
-       addl    %ecx,%r12d
-       movl    44(%rsp),%ebp
-       movl    %esi,%eax
-       movl    %esi,%ebx
+       movl    %edi,%ebx
        xorl    52(%rsp),%ebp
-       andl    %edi,%eax
+       andl    %esi,%eax
        movl    %r12d,%ecx
        xorl    12(%rsp),%ebp
-       xorl    %edi,%ebx
        leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    32(%rsp),%ebp
        addl    %eax,%r11d
-       andl    %r13d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%r11d
+       andl    %r13d,%ebx
+       addl    %ecx,%r11d
        roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    48(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,44(%rsp)
-       addl    %ecx,%r11d
-       movl    48(%rsp),%edx
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
-       xorl    56(%rsp),%edx
-       andl    %esi,%eax
+       movl    %esi,%ebx
+       xorl    56(%rsp),%r14d
+       andl    %r13d,%eax
        movl    %r11d,%ecx
-       xorl    16(%rsp),%edx
-       xorl    %esi,%ebx
+       xorl    16(%rsp),%r14d
        leal    -1894007588(%rbp,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    36(%rsp),%edx
        addl    %eax,%edi
+       roll    $1,%r14d
        andl    %r12d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%edi
-       roll    $30,%r12d
-       movl    %edx,48(%rsp)
        addl    %ecx,%edi
-       movl    52(%rsp),%ebp
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
-       xorl    60(%rsp),%ebp
-       andl    %r13d,%eax
+       roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    52(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,48(%rsp)
+       movl    %r13d,%ebx
+       xorl    60(%rsp),%edx
+       andl    %r12d,%eax
        movl    %edi,%ecx
-       xorl    20(%rsp),%ebp
-       xorl    %r13d,%ebx
-       leal    -1894007588(%rdx,%rsi,1),%esi
+       xorl    20(%rsp),%edx
+       leal    -1894007588(%r14,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    40(%rsp),%ebp
        addl    %eax,%esi
+       roll    $1,%edx
        andl    %r11d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%esi
-       roll    $30,%r11d
-       movl    %ebp,52(%rsp)
        addl    %ecx,%esi
-       movl    56(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    0(%rsp),%edx
-       andl    %r12d,%eax
+       roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    56(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,52(%rsp)
+       movl    %r12d,%ebx
+       xorl    0(%rsp),%ebp
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    24(%rsp),%ebp
+       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    44(%rsp),%edx
        addl    %eax,%r13d
+       roll    $1,%ebp
        andl    %edi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %edx,56(%rsp)
        addl    %ecx,%r13d
-       movl    60(%rsp),%ebp
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    4(%rsp),%ebp
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    60(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,56(%rsp)
+       movl    %r11d,%ebx
+       xorl    4(%rsp),%r14d
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    28(%rsp),%r14d
+       leal    -1894007588(%rbp,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    48(%rsp),%ebp
        addl    %eax,%r12d
+       roll    $1,%r14d
        andl    %esi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %ebp,60(%rsp)
        addl    %ecx,%r12d
-       movl    0(%rsp),%edx
-       movl    %esi,%eax
-       movl    %esi,%ebx
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    0(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,60(%rsp)
+       movl    %edi,%ebx
        xorl    8(%rsp),%edx
-       andl    %edi,%eax
+       andl    %esi,%eax
        movl    %r12d,%ecx
        xorl    32(%rsp),%edx
-       xorl    %edi,%ebx
-       leal    -1894007588(%rbp,%r11,1),%r11d
+       leal    -1894007588(%r14,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    52(%rsp),%edx
        addl    %eax,%r11d
-       andl    %r13d,%ebx
        roll    $1,%edx
-       addl    %ebx,%r11d
+       andl    %r13d,%ebx
+       addl    %ecx,%r11d
        roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    4(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,0(%rsp)
-       addl    %ecx,%r11d
-       movl    4(%rsp),%ebp
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
+       movl    %esi,%ebx
        xorl    12(%rsp),%ebp
-       andl    %esi,%eax
+       andl    %r13d,%eax
        movl    %r11d,%ecx
        xorl    36(%rsp),%ebp
-       xorl    %esi,%ebx
        leal    -1894007588(%rdx,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    56(%rsp),%ebp
        addl    %eax,%edi
-       andl    %r12d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%edi
+       andl    %r12d,%ebx
+       addl    %ecx,%edi
        roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    8(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,4(%rsp)
-       addl    %ecx,%edi
-       movl    8(%rsp),%edx
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
-       xorl    16(%rsp),%edx
-       andl    %r13d,%eax
+       movl    %r13d,%ebx
+       xorl    16(%rsp),%r14d
+       andl    %r12d,%eax
        movl    %edi,%ecx
-       xorl    40(%rsp),%edx
-       xorl    %r13d,%ebx
+       xorl    40(%rsp),%r14d
        leal    -1894007588(%rbp,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    60(%rsp),%edx
        addl    %eax,%esi
+       roll    $1,%r14d
        andl    %r11d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%esi
-       roll    $30,%r11d
-       movl    %edx,8(%rsp)
        addl    %ecx,%esi
-       movl    12(%rsp),%ebp
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    20(%rsp),%ebp
-       andl    %r12d,%eax
+       roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    12(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,8(%rsp)
+       movl    %r12d,%ebx
+       xorl    20(%rsp),%edx
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    44(%rsp),%ebp
-       xorl    %r12d,%ebx
-       leal    -1894007588(%rdx,%r13,1),%r13d
+       xorl    44(%rsp),%edx
+       leal    -1894007588(%r14,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    0(%rsp),%ebp
        addl    %eax,%r13d
+       roll    $1,%edx
        andl    %edi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %ebp,12(%rsp)
        addl    %ecx,%r13d
-       movl    16(%rsp),%edx
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    24(%rsp),%edx
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    16(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,12(%rsp)
+       movl    %r11d,%ebx
+       xorl    24(%rsp),%ebp
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rbp,%r12,1),%r12d
+       xorl    48(%rsp),%ebp
+       leal    -1894007588(%rdx,%r12,1),%r12d
+       xorl    %edi,%ebx
        roll    $5,%ecx
-       xorl    4(%rsp),%edx
        addl    %eax,%r12d
+       roll    $1,%ebp
        andl    %esi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %edx,16(%rsp)
        addl    %ecx,%r12d
-       movl    20(%rsp),%ebp
-       movl    %esi,%eax
-       movl    %esi,%ebx
-       xorl    28(%rsp),%ebp
-       andl    %edi,%eax
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    20(%rsp),%r14d
+       movl    %edi,%eax
+       movl    %ebp,16(%rsp)
+       movl    %edi,%ebx
+       xorl    28(%rsp),%r14d
+       andl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %edi,%ebx
-       leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    52(%rsp),%r14d
+       leal    -1894007588(%rbp,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    8(%rsp),%ebp
        addl    %eax,%r11d
+       roll    $1,%r14d
        andl    %r13d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r11d
-       roll    $30,%r13d
-       movl    %ebp,20(%rsp)
        addl    %ecx,%r11d
-       movl    24(%rsp),%edx
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
+       roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    24(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,20(%rsp)
+       movl    %esi,%ebx
        xorl    32(%rsp),%edx
-       andl    %esi,%eax
+       andl    %r13d,%eax
        movl    %r11d,%ecx
        xorl    56(%rsp),%edx
-       xorl    %esi,%ebx
-       leal    -1894007588(%rbp,%rdi,1),%edi
+       leal    -1894007588(%r14,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    12(%rsp),%edx
        addl    %eax,%edi
-       andl    %r12d,%ebx
        roll    $1,%edx
-       addl    %ebx,%edi
+       andl    %r12d,%ebx
+       addl    %ecx,%edi
        roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    28(%rsp),%ebp
+       movl    %r13d,%eax
        movl    %edx,24(%rsp)
-       addl    %ecx,%edi
-       movl    28(%rsp),%ebp
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
+       movl    %r13d,%ebx
        xorl    36(%rsp),%ebp
-       andl    %r13d,%eax
+       andl    %r12d,%eax
        movl    %edi,%ecx
        xorl    60(%rsp),%ebp
-       xorl    %r13d,%ebx
        leal    -1894007588(%rdx,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    16(%rsp),%ebp
        addl    %eax,%esi
-       andl    %r11d,%ebx
        roll    $1,%ebp
-       addl    %ebx,%esi
+       andl    %r11d,%ebx
+       addl    %ecx,%esi
        roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    32(%rsp),%r14d
+       movl    %r12d,%eax
        movl    %ebp,28(%rsp)
-       addl    %ecx,%esi
-       movl    32(%rsp),%edx
-       movl    %r11d,%eax
-       movl    %r11d,%ebx
-       xorl    40(%rsp),%edx
-       andl    %r12d,%eax
+       movl    %r12d,%ebx
+       xorl    40(%rsp),%r14d
+       andl    %r11d,%eax
        movl    %esi,%ecx
-       xorl    0(%rsp),%edx
-       xorl    %r12d,%ebx
+       xorl    0(%rsp),%r14d
        leal    -1894007588(%rbp,%r13,1),%r13d
+       xorl    %r11d,%ebx
        roll    $5,%ecx
-       xorl    20(%rsp),%edx
        addl    %eax,%r13d
+       roll    $1,%r14d
        andl    %edi,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r13d
-       roll    $30,%edi
-       movl    %edx,32(%rsp)
        addl    %ecx,%r13d
-       movl    36(%rsp),%ebp
-       movl    %edi,%eax
-       movl    %edi,%ebx
-       xorl    44(%rsp),%ebp
-       andl    %r11d,%eax
+       roll    $30,%edi
+       addl    %ebx,%r13d
+       xorl    36(%rsp),%edx
+       movl    %r11d,%eax
+       movl    %r14d,32(%rsp)
+       movl    %r11d,%ebx
+       xorl    44(%rsp),%edx
+       andl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r11d,%ebx
-       leal    -1894007588(%rdx,%r12,1),%r12d
-       roll    $5,%ecx
-       xorl    24(%rsp),%ebp
+       xorl    4(%rsp),%edx
+       leal    -1894007588(%r14,%r12,1),%r12d
+       xorl    %edi,%ebx
+       roll    $5,%ecx
        addl    %eax,%r12d
+       roll    $1,%edx
        andl    %esi,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%r12d
-       roll    $30,%esi
-       movl    %ebp,36(%rsp)
        addl    %ecx,%r12d
-       movl    40(%rsp),%edx
-       movl    %esi,%eax
-       movl    %esi,%ebx
-       xorl    48(%rsp),%edx
-       andl    %edi,%eax
+       roll    $30,%esi
+       addl    %ebx,%r12d
+       xorl    40(%rsp),%ebp
+       movl    %edi,%eax
+       movl    %edx,36(%rsp)
+       movl    %edi,%ebx
+       xorl    48(%rsp),%ebp
+       andl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %edi,%ebx
-       leal    -1894007588(%rbp,%r11,1),%r11d
+       xorl    8(%rsp),%ebp
+       leal    -1894007588(%rdx,%r11,1),%r11d
+       xorl    %esi,%ebx
        roll    $5,%ecx
-       xorl    28(%rsp),%edx
        addl    %eax,%r11d
+       roll    $1,%ebp
        andl    %r13d,%ebx
-       roll    $1,%edx
-       addl    %ebx,%r11d
-       roll    $30,%r13d
-       movl    %edx,40(%rsp)
        addl    %ecx,%r11d
-       movl    44(%rsp),%ebp
-       movl    %r13d,%eax
-       movl    %r13d,%ebx
-       xorl    52(%rsp),%ebp
-       andl    %esi,%eax
+       roll    $30,%r13d
+       addl    %ebx,%r11d
+       xorl    44(%rsp),%r14d
+       movl    %esi,%eax
+       movl    %ebp,40(%rsp)
+       movl    %esi,%ebx
+       xorl    52(%rsp),%r14d
+       andl    %r13d,%eax
        movl    %r11d,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %esi,%ebx
-       leal    -1894007588(%rdx,%rdi,1),%edi
+       xorl    12(%rsp),%r14d
+       leal    -1894007588(%rbp,%rdi,1),%edi
+       xorl    %r13d,%ebx
        roll    $5,%ecx
-       xorl    32(%rsp),%ebp
        addl    %eax,%edi
+       roll    $1,%r14d
        andl    %r12d,%ebx
-       roll    $1,%ebp
-       addl    %ebx,%edi
-       roll    $30,%r12d
-       movl    %ebp,44(%rsp)
        addl    %ecx,%edi
-       movl    48(%rsp),%edx
-       movl    %r12d,%eax
-       movl    %r12d,%ebx
+       roll    $30,%r12d
+       addl    %ebx,%edi
+       xorl    48(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,44(%rsp)
+       movl    %r13d,%ebx
        xorl    56(%rsp),%edx
-       andl    %r13d,%eax
+       andl    %r12d,%eax
        movl    %edi,%ecx
        xorl    16(%rsp),%edx
-       xorl    %r13d,%ebx
-       leal    -1894007588(%rbp,%rsi,1),%esi
+       leal    -1894007588(%r14,%rsi,1),%esi
+       xorl    %r12d,%ebx
        roll    $5,%ecx
-       xorl    36(%rsp),%edx
        addl    %eax,%esi
-       andl    %r11d,%ebx
        roll    $1,%edx
-       addl    %ebx,%esi
+       andl    %r11d,%ebx
+       addl    %ecx,%esi
        roll    $30,%r11d
+       addl    %ebx,%esi
+       xorl    52(%rsp),%ebp
+       movl    %edi,%eax
        movl    %edx,48(%rsp)
-       addl    %ecx,%esi
-       movl    52(%rsp),%ebp
-       movl    %r11d,%eax
        movl    %esi,%ecx
        xorl    60(%rsp),%ebp
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r13,1),%r13d
        xorl    20(%rsp),%ebp
-       xorl    %r12d,%eax
+       leal    -899497514(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    40(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%ebp
+       xorl    56(%rsp),%r14d
+       movl    %esi,%eax
        movl    %ebp,52(%rsp)
-       movl    56(%rsp),%edx
-       movl    %edi,%eax
        movl    %r13d,%ecx
-       xorl    0(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    0(%rsp),%r14d
+       xorl    %r11d,%eax
        roll    $5,%ecx
+       xorl    24(%rsp),%r14d
        leal    -899497514(%rbp,%r12,1),%r12d
-       xorl    24(%rsp),%edx
-       xorl    %r11d,%eax
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    44(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%edx
-       movl    %edx,56(%rsp)
-       movl    60(%rsp),%ebp
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    60(%rsp),%edx
+       movl    %r13d,%eax
+       movl    %r14d,56(%rsp)
        movl    %r12d,%ecx
-       xorl    4(%rsp),%ebp
-       xorl    %r13d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r11,1),%r11d
-       xorl    28(%rsp),%ebp
+       xorl    4(%rsp),%edx
        xorl    %edi,%eax
+       roll    $5,%ecx
+       xorl    28(%rsp),%edx
+       leal    -899497514(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    48(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%ebp
-       movl    %ebp,60(%rsp)
-       movl    0(%rsp),%edx
-       movl    %r13d,%eax
+       roll    $1,%edx
+       xorl    0(%rsp),%ebp
+       movl    %r12d,%eax
+       movl    %edx,60(%rsp)
        movl    %r11d,%ecx
-       xorl    8(%rsp),%edx
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%rdi,1),%edi
-       xorl    32(%rsp),%edx
+       xorl    8(%rsp),%ebp
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    32(%rsp),%ebp
+       leal    -899497514(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    52(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,0(%rsp)
-       movl    4(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%ebp
+       xorl    4(%rsp),%r14d
+       movl    %r11d,%eax
+       movl    %ebp,0(%rsp)
        movl    %edi,%ecx
-       xorl    12(%rsp),%ebp
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rsi,1),%esi
-       xorl    36(%rsp),%ebp
+       xorl    12(%rsp),%r14d
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    36(%rsp),%r14d
+       leal    -899497514(%rbp,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    56(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,4(%rsp)
-       movl    8(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%r14d
+       xorl    8(%rsp),%edx
+       movl    %edi,%eax
+       movl    %r14d,4(%rsp)
        movl    %esi,%ecx
        xorl    16(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %r12d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r13,1),%r13d
        xorl    40(%rsp),%edx
-       xorl    %r12d,%eax
+       leal    -899497514(%r14,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    60(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
        roll    $1,%edx
+       xorl    12(%rsp),%ebp
+       movl    %esi,%eax
        movl    %edx,8(%rsp)
-       movl    12(%rsp),%ebp
-       movl    %edi,%eax
        movl    %r13d,%ecx
        xorl    20(%rsp),%ebp
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r12,1),%r12d
        xorl    44(%rsp),%ebp
-       xorl    %r11d,%eax
+       leal    -899497514(%rdx,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    0(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%ebp
+       xorl    16(%rsp),%r14d
+       movl    %r13d,%eax
        movl    %ebp,12(%rsp)
-       movl    16(%rsp),%edx
-       movl    %esi,%eax
        movl    %r12d,%ecx
-       xorl    24(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    24(%rsp),%r14d
+       xorl    %edi,%eax
        roll    $5,%ecx
+       xorl    48(%rsp),%r14d
        leal    -899497514(%rbp,%r11,1),%r11d
-       xorl    48(%rsp),%edx
-       xorl    %edi,%eax
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    4(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
-       roll    $1,%edx
-       movl    %edx,16(%rsp)
-       movl    20(%rsp),%ebp
-       movl    %r13d,%eax
+       roll    $1,%r14d
+       xorl    20(%rsp),%edx
+       movl    %r12d,%eax
+       movl    %r14d,16(%rsp)
        movl    %r11d,%ecx
-       xorl    28(%rsp),%ebp
-       xorl    %r12d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%rdi,1),%edi
-       xorl    52(%rsp),%ebp
+       xorl    28(%rsp),%edx
        xorl    %esi,%eax
+       roll    $5,%ecx
+       xorl    52(%rsp),%edx
+       leal    -899497514(%r14,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    8(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%ebp
-       movl    %ebp,20(%rsp)
-       movl    24(%rsp),%edx
-       movl    %r12d,%eax
+       roll    $1,%edx
+       xorl    24(%rsp),%ebp
+       movl    %r11d,%eax
+       movl    %edx,20(%rsp)
        movl    %edi,%ecx
-       xorl    32(%rsp),%edx
-       xorl    %r11d,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%rsi,1),%esi
-       xorl    56(%rsp),%edx
+       xorl    32(%rsp),%ebp
        xorl    %r13d,%eax
+       roll    $5,%ecx
+       xorl    56(%rsp),%ebp
+       leal    -899497514(%rdx,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    12(%rsp),%edx
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%edx
-       movl    %edx,24(%rsp)
-       movl    28(%rsp),%ebp
-       movl    %r11d,%eax
+       roll    $1,%ebp
+       xorl    28(%rsp),%r14d
+       movl    %edi,%eax
+       movl    %ebp,24(%rsp)
        movl    %esi,%ecx
-       xorl    36(%rsp),%ebp
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r13,1),%r13d
-       xorl    60(%rsp),%ebp
+       xorl    36(%rsp),%r14d
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    60(%rsp),%r14d
+       leal    -899497514(%rbp,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    16(%rsp),%ebp
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%ebp
-       movl    %ebp,28(%rsp)
-       movl    32(%rsp),%edx
-       movl    %edi,%eax
+       roll    $1,%r14d
+       xorl    32(%rsp),%edx
+       movl    %esi,%eax
+       movl    %r14d,28(%rsp)
        movl    %r13d,%ecx
        xorl    40(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    %r11d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r12,1),%r12d
        xorl    0(%rsp),%edx
-       xorl    %r11d,%eax
+       leal    -899497514(%r14,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    20(%rsp),%edx
        roll    $30,%esi
        addl    %eax,%r12d
        roll    $1,%edx
-       movl    %edx,32(%rsp)
-       movl    36(%rsp),%ebp
-       movl    %esi,%eax
+       xorl    36(%rsp),%ebp
+       movl    %r13d,%eax
+
        movl    %r12d,%ecx
        xorl    44(%rsp),%ebp
-       xorl    %r13d,%eax
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%r11,1),%r11d
        xorl    4(%rsp),%ebp
-       xorl    %edi,%eax
+       leal    -899497514(%rdx,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    24(%rsp),%ebp
        roll    $30,%r13d
        addl    %eax,%r11d
        roll    $1,%ebp
-       movl    %ebp,36(%rsp)
-       movl    40(%rsp),%edx
-       movl    %r13d,%eax
+       xorl    40(%rsp),%r14d
+       movl    %r12d,%eax
+
        movl    %r11d,%ecx
-       xorl    48(%rsp),%edx
-       xorl    %r12d,%eax
+       xorl    48(%rsp),%r14d
+       xorl    %esi,%eax
        roll    $5,%ecx
+       xorl    8(%rsp),%r14d
        leal    -899497514(%rbp,%rdi,1),%edi
-       xorl    8(%rsp),%edx
-       xorl    %esi,%eax
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    28(%rsp),%edx
        roll    $30,%r12d
        addl    %eax,%edi
-       roll    $1,%edx
-       movl    %edx,40(%rsp)
-       movl    44(%rsp),%ebp
-       movl    %r12d,%eax
+       roll    $1,%r14d
+       xorl    44(%rsp),%edx
+       movl    %r11d,%eax
+
        movl    %edi,%ecx
-       xorl    52(%rsp),%ebp
-       xorl    %r11d,%eax
+       xorl    52(%rsp),%edx
+       xorl    %r13d,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%rsi,1),%esi
-       xorl    12(%rsp),%ebp
-       xorl    %r13d,%eax
+       xorl    12(%rsp),%edx
+       leal    -899497514(%r14,%rsi,1),%esi
+       xorl    %r12d,%eax
        addl    %ecx,%esi
-       xorl    32(%rsp),%ebp
        roll    $30,%r11d
        addl    %eax,%esi
-       roll    $1,%ebp
-       movl    %ebp,44(%rsp)
-       movl    48(%rsp),%edx
-       movl    %r11d,%eax
+       roll    $1,%edx
+       xorl    48(%rsp),%ebp
+       movl    %edi,%eax
+
        movl    %esi,%ecx
-       xorl    56(%rsp),%edx
-       xorl    %edi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rbp,%r13,1),%r13d
-       xorl    16(%rsp),%edx
+       xorl    56(%rsp),%ebp
        xorl    %r12d,%eax
+       roll    $5,%ecx
+       xorl    16(%rsp),%ebp
+       leal    -899497514(%rdx,%r13,1),%r13d
+       xorl    %r11d,%eax
        addl    %ecx,%r13d
-       xorl    36(%rsp),%edx
        roll    $30,%edi
        addl    %eax,%r13d
-       roll    $1,%edx
-       movl    %edx,48(%rsp)
-       movl    52(%rsp),%ebp
-       movl    %edi,%eax
+       roll    $1,%ebp
+       xorl    52(%rsp),%r14d
+       movl    %esi,%eax
+
        movl    %r13d,%ecx
-       xorl    60(%rsp),%ebp
-       xorl    %esi,%eax
-       roll    $5,%ecx
-       leal    -899497514(%rdx,%r12,1),%r12d
-       xorl    20(%rsp),%ebp
+       xorl    60(%rsp),%r14d
        xorl    %r11d,%eax
+       roll    $5,%ecx
+       xorl    20(%rsp),%r14d
+       leal    -899497514(%rbp,%r12,1),%r12d
+       xorl    %edi,%eax
        addl    %ecx,%r12d
-       xorl    40(%rsp),%ebp
        roll    $30,%esi
        addl    %eax,%r12d
-       roll    $1,%ebp
-       movl    56(%rsp),%edx
-       movl    %esi,%eax
+       roll    $1,%r14d
+       xorl    56(%rsp),%edx
+       movl    %r13d,%eax
+
        movl    %r12d,%ecx
        xorl    0(%rsp),%edx
-       xorl    %r13d,%eax
+       xorl    %edi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rbp,%r11,1),%r11d
        xorl    24(%rsp),%edx
-       xorl    %edi,%eax
+       leal    -899497514(%r14,%r11,1),%r11d
+       xorl    %esi,%eax
        addl    %ecx,%r11d
-       xorl    44(%rsp),%edx
        roll    $30,%r13d
        addl    %eax,%r11d
        roll    $1,%edx
-       movl    60(%rsp),%ebp
-       movl    %r13d,%eax
+       xorl    60(%rsp),%ebp
+       movl    %r12d,%eax
+
        movl    %r11d,%ecx
        xorl    4(%rsp),%ebp
-       xorl    %r12d,%eax
+       xorl    %esi,%eax
        roll    $5,%ecx
-       leal    -899497514(%rdx,%rdi,1),%edi
        xorl    28(%rsp),%ebp
-       xorl    %esi,%eax
+       leal    -899497514(%rdx,%rdi,1),%edi
+       xorl    %r13d,%eax
        addl    %ecx,%edi
-       xorl    48(%rsp),%ebp
        roll    $30,%r12d
        addl    %eax,%edi
        roll    $1,%ebp
-       movl    %r12d,%eax
+       movl    %r11d,%eax
        movl    %edi,%ecx
-       xorl    %r11d,%eax
+       xorl    %r13d,%eax
        leal    -899497514(%rbp,%rsi,1),%esi
        roll    $5,%ecx
-       xorl    %r13d,%eax
+       xorl    %r12d,%eax
        addl    %ecx,%esi
        roll    $30,%r11d
        addl    %eax,%esi
@@ -1319,29 +1277,218 @@ L$loop:
        jnz     L$loop
 
        movq    64(%rsp),%rsi
-       movq    (%rsi),%r13
-       movq    8(%rsi),%r12
-       movq    16(%rsi),%rbp
-       movq    24(%rsi),%rbx
-       leaq    32(%rsi),%rsp
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
 L$epilogue:
        .byte   0xf3,0xc3
 
 
+
+.p2align       5
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+
+       movdqu  (%rdi),%xmm0
+       movd    16(%rdi),%xmm1
+       movdqa  K_XX_XX+160(%rip),%xmm3
+
+       movdqu  (%rsi),%xmm4
+       pshufd  $27,%xmm0,%xmm0
+       movdqu  16(%rsi),%xmm5
+       pshufd  $27,%xmm1,%xmm1
+       movdqu  32(%rsi),%xmm6
+.byte  102,15,56,0,227
+       movdqu  48(%rsi),%xmm7
+.byte  102,15,56,0,235
+.byte  102,15,56,0,243
+       movdqa  %xmm1,%xmm9
+.byte  102,15,56,0,251
+       jmp     L$oop_shaext
+
+.p2align       4
+L$oop_shaext:
+       decq    %rdx
+       leaq    64(%rsi),%r8
+       paddd   %xmm4,%xmm1
+       cmovneq %r8,%rsi
+       movdqa  %xmm0,%xmm8
+.byte  15,56,201,229
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0
+.byte  15,56,200,213
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+.byte  15,56,202,231
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,0
+.byte  15,56,200,206
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236
+.byte  15,56,201,247
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0
+.byte  15,56,200,215
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252
+.byte  15,56,202,245
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,0
+.byte  15,56,200,204
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254
+.byte  15,56,201,229
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,0
+.byte  15,56,200,213
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+.byte  15,56,202,231
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1
+.byte  15,56,200,206
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236
+.byte  15,56,201,247
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,1
+.byte  15,56,200,215
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252
+.byte  15,56,202,245
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1
+.byte  15,56,200,204
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254
+.byte  15,56,201,229
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,1
+.byte  15,56,200,213
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+.byte  15,56,202,231
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,1
+.byte  15,56,200,206
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236
+.byte  15,56,201,247
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2
+.byte  15,56,200,215
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252
+.byte  15,56,202,245
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,2
+.byte  15,56,200,204
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254
+.byte  15,56,201,229
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2
+.byte  15,56,200,213
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+.byte  15,56,202,231
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,2
+.byte  15,56,200,206
+       pxor    %xmm7,%xmm5
+.byte  15,56,202,236
+.byte  15,56,201,247
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,2
+.byte  15,56,200,215
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,252
+.byte  15,56,202,245
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3
+.byte  15,56,200,204
+       pxor    %xmm5,%xmm7
+.byte  15,56,202,254
+       movdqu  (%rsi),%xmm4
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,3
+.byte  15,56,200,213
+       movdqu  16(%rsi),%xmm5
+.byte  102,15,56,0,227
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3
+.byte  15,56,200,206
+       movdqu  32(%rsi),%xmm6
+.byte  102,15,56,0,235
+
+       movdqa  %xmm0,%xmm2
+.byte  15,58,204,193,3
+.byte  15,56,200,215
+       movdqu  48(%rsi),%xmm7
+.byte  102,15,56,0,243
+
+       movdqa  %xmm0,%xmm1
+.byte  15,58,204,194,3
+.byte  65,15,56,200,201
+.byte  102,15,56,0,251
+
+       paddd   %xmm8,%xmm0
+       movdqa  %xmm1,%xmm9
+
+       jnz     L$oop_shaext
+
+       pshufd  $27,%xmm0,%xmm0
+       pshufd  $27,%xmm1,%xmm1
+       movdqu  %xmm0,(%rdi)
+       movd    %xmm1,16(%rdi)
+
+       .byte   0xf3,0xc3
+
+
 .p2align       4
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
+
+       movq    %rsp,%r11
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
        leaq    -64(%rsp),%rsp
+       andq    $-64,%rsp
        movq    %rdi,%r8
        movq    %rsi,%r9
        movq    %rdx,%r10
 
        shlq    $6,%r10
        addq    %r9,%r10
-       leaq    K_XX_XX(%rip),%r11
+       leaq    K_XX_XX+64(%rip),%r14
 
        movl    0(%r8),%eax
        movl    4(%r8),%ebx
@@ -1353,18 +1500,18 @@ _ssse3_shortcut:
        xorl    %edx,%edi
        andl    %edi,%esi
 
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
+       movdqa  64(%r14),%xmm6
+       movdqa  -64(%r14),%xmm9
        movdqu  0(%r9),%xmm0
        movdqu  16(%r9),%xmm1
        movdqu  32(%r9),%xmm2
        movdqu  48(%r9),%xmm3
 .byte  102,15,56,0,198
-       addq    $64,%r9
 .byte  102,15,56,0,206
 .byte  102,15,56,0,214
-.byte  102,15,56,0,222
+       addq    $64,%r9
        paddd   %xmm9,%xmm0
+.byte  102,15,56,0,222
        paddd   %xmm9,%xmm1
        paddd   %xmm9,%xmm2
        movdqa  %xmm0,0(%rsp)
@@ -1376,24 +1523,24 @@ _ssse3_shortcut:
        jmp     L$oop_ssse3
 .p2align       4
 L$oop_ssse3:
-       movdqa  %xmm1,%xmm4
        rorl    $2,%ebx
+       pshufd  $238,%xmm0,%xmm4
        xorl    %edx,%esi
        movdqa  %xmm3,%xmm8
-.byte  102,15,58,15,224,8
+       paddd   %xmm3,%xmm9
        movl    %eax,%edi
        addl    0(%rsp),%ebp
-       paddd   %xmm3,%xmm9
+       punpcklqdq      %xmm1,%xmm4
        xorl    %ecx,%ebx
        roll    $5,%eax
-       psrldq  $4,%xmm8
        addl    %esi,%ebp
+       psrldq  $4,%xmm8
        andl    %ebx,%edi
-       pxor    %xmm0,%xmm4
        xorl    %ecx,%ebx
+       pxor    %xmm0,%xmm4
        addl    %eax,%ebp
-       pxor    %xmm2,%xmm8
        rorl    $7,%eax
+       pxor    %xmm2,%xmm8
        xorl    %ecx,%edi
        movl    %ebp,%esi
        addl    4(%rsp),%edx
@@ -1404,57 +1551,57 @@ L$oop_ssse3:
        addl    %edi,%edx
        andl    %eax,%esi
        movdqa  %xmm4,%xmm10
-       movdqa  %xmm4,%xmm8
        xorl    %ebx,%eax
        addl    %ebp,%edx
        rorl    $7,%ebp
+       movdqa  %xmm4,%xmm8
        xorl    %ebx,%esi
        pslldq  $12,%xmm10
        paddd   %xmm4,%xmm4
        movl    %edx,%edi
        addl    8(%rsp),%ecx
+       psrld   $31,%xmm8
        xorl    %eax,%ebp
        roll    $5,%edx
-       psrld   $31,%xmm8
        addl    %esi,%ecx
-       andl    %ebp,%edi
        movdqa  %xmm10,%xmm9
+       andl    %ebp,%edi
        xorl    %eax,%ebp
-       addl    %edx,%ecx
        psrld   $30,%xmm10
-       por     %xmm8,%xmm4
+       addl    %edx,%ecx
        rorl    $7,%edx
+       por     %xmm8,%xmm4
        xorl    %eax,%edi
        movl    %ecx,%esi
        addl    12(%rsp),%ebx
        pslld   $2,%xmm9
        pxor    %xmm10,%xmm4
        xorl    %ebp,%edx
+       movdqa  -64(%r14),%xmm10
        roll    $5,%ecx
-       movdqa  0(%r11),%xmm10
        addl    %edi,%ebx
        andl    %edx,%esi
        pxor    %xmm9,%xmm4
        xorl    %ebp,%edx
        addl    %ecx,%ebx
-       movdqa  %xmm2,%xmm5
        rorl    $7,%ecx
+       pshufd  $238,%xmm1,%xmm5
        xorl    %ebp,%esi
        movdqa  %xmm4,%xmm9
-.byte  102,15,58,15,233,8
+       paddd   %xmm4,%xmm10
        movl    %ebx,%edi
        addl    16(%rsp),%eax
-       paddd   %xmm4,%xmm10
+       punpcklqdq      %xmm2,%xmm5
        xorl    %edx,%ecx
        roll    $5,%ebx
-       psrldq  $4,%xmm9
        addl    %esi,%eax
+       psrldq  $4,%xmm9
        andl    %ecx,%edi
-       pxor    %xmm1,%xmm5
        xorl    %edx,%ecx
+       pxor    %xmm1,%xmm5
        addl    %ebx,%eax
-       pxor    %xmm3,%xmm9
        rorl    $7,%ebx
+       pxor    %xmm3,%xmm9
        xorl    %edx,%edi
        movl    %eax,%esi
        addl    20(%rsp),%ebp
@@ -1465,57 +1612,57 @@ L$oop_ssse3:
        addl    %edi,%ebp
        andl    %ebx,%esi
        movdqa  %xmm5,%xmm8
-       movdqa  %xmm5,%xmm9
        xorl    %ecx,%ebx
        addl    %eax,%ebp
        rorl    $7,%eax
+       movdqa  %xmm5,%xmm9
        xorl    %ecx,%esi
        pslldq  $12,%xmm8
        paddd   %xmm5,%xmm5
        movl    %ebp,%edi
        addl    24(%rsp),%edx
+       psrld   $31,%xmm9
        xorl    %ebx,%eax
        roll    $5,%ebp
-       psrld   $31,%xmm9
        addl    %esi,%edx
-       andl    %eax,%edi
        movdqa  %xmm8,%xmm10
+       andl    %eax,%edi
        xorl    %ebx,%eax
-       addl    %ebp,%edx
        psrld   $30,%xmm8
-       por     %xmm9,%xmm5
+       addl    %ebp,%edx
        rorl    $7,%ebp
+       por     %xmm9,%xmm5
        xorl    %ebx,%edi
        movl    %edx,%esi
        addl    28(%rsp),%ecx
        pslld   $2,%xmm10
        pxor    %xmm8,%xmm5
        xorl    %eax,%ebp
+       movdqa  -32(%r14),%xmm8
        roll    $5,%edx
-       movdqa  16(%r11),%xmm8
        addl    %edi,%ecx
        andl    %ebp,%esi
        pxor    %xmm10,%xmm5
        xorl    %eax,%ebp
        addl    %edx,%ecx
-       movdqa  %xmm3,%xmm6
        rorl    $7,%edx
+       pshufd  $238,%xmm2,%xmm6
        xorl    %eax,%esi
        movdqa  %xmm5,%xmm10
-.byte  102,15,58,15,242,8
+       paddd   %xmm5,%xmm8
        movl    %ecx,%edi
        addl    32(%rsp),%ebx
-       paddd   %xmm5,%xmm8
+       punpcklqdq      %xmm3,%xmm6
        xorl    %ebp,%edx
        roll    $5,%ecx
-       psrldq  $4,%xmm10
        addl    %esi,%ebx
+       psrldq  $4,%xmm10
        andl    %edx,%edi
-       pxor    %xmm2,%xmm6
        xorl    %ebp,%edx
+       pxor    %xmm2,%xmm6
        addl    %ecx,%ebx
-       pxor    %xmm4,%xmm10
        rorl    $7,%ecx
+       pxor    %xmm4,%xmm10
        xorl    %ebp,%edi
        movl    %ebx,%esi
        addl    36(%rsp),%eax
@@ -1526,57 +1673,57 @@ L$oop_ssse3:
        addl    %edi,%eax
        andl    %ecx,%esi
        movdqa  %xmm6,%xmm9
-       movdqa  %xmm6,%xmm10
        xorl    %edx,%ecx
        addl    %ebx,%eax
        rorl    $7,%ebx
+       movdqa  %xmm6,%xmm10
        xorl    %edx,%esi
        pslldq  $12,%xmm9
        paddd   %xmm6,%xmm6
        movl    %eax,%edi
        addl    40(%rsp),%ebp
+       psrld   $31,%xmm10
        xorl    %ecx,%ebx
        roll    $5,%eax
-       psrld   $31,%xmm10
        addl    %esi,%ebp
-       andl    %ebx,%edi
        movdqa  %xmm9,%xmm8
+       andl    %ebx,%edi
        xorl    %ecx,%ebx
-       addl    %eax,%ebp
        psrld   $30,%xmm9
-       por     %xmm10,%xmm6
+       addl    %eax,%ebp
        rorl    $7,%eax
+       por     %xmm10,%xmm6
        xorl    %ecx,%edi
        movl    %ebp,%esi
        addl    44(%rsp),%edx
        pslld   $2,%xmm8
        pxor    %xmm9,%xmm6
        xorl    %ebx,%eax
+       movdqa  -32(%r14),%xmm9
        roll    $5,%ebp
-       movdqa  16(%r11),%xmm9
        addl    %edi,%edx
        andl    %eax,%esi
        pxor    %xmm8,%xmm6
        xorl    %ebx,%eax
        addl    %ebp,%edx
-       movdqa  %xmm4,%xmm7
        rorl    $7,%ebp
+       pshufd  $238,%xmm3,%xmm7
        xorl    %ebx,%esi
        movdqa  %xmm6,%xmm8
-.byte  102,15,58,15,251,8
+       paddd   %xmm6,%xmm9
        movl    %edx,%edi
        addl    48(%rsp),%ecx
-       paddd   %xmm6,%xmm9
+       punpcklqdq      %xmm4,%xmm7
        xorl    %eax,%ebp
        roll    $5,%edx
-       psrldq  $4,%xmm8
        addl    %esi,%ecx
+       psrldq  $4,%xmm8
        andl    %ebp,%edi
-       pxor    %xmm3,%xmm7
        xorl    %eax,%ebp
+       pxor    %xmm3,%xmm7
        addl    %edx,%ecx
-       pxor    %xmm5,%xmm8
        rorl    $7,%edx
+       pxor    %xmm5,%xmm8
        xorl    %eax,%edi
        movl    %ecx,%esi
        addl    52(%rsp),%ebx
@@ -1587,78 +1734,78 @@ L$oop_ssse3:
        addl    %edi,%ebx
        andl    %edx,%esi
        movdqa  %xmm7,%xmm10
-       movdqa  %xmm7,%xmm8
        xorl    %ebp,%edx
        addl    %ecx,%ebx
        rorl    $7,%ecx
+       movdqa  %xmm7,%xmm8
        xorl    %ebp,%esi
        pslldq  $12,%xmm10
        paddd   %xmm7,%xmm7
        movl    %ebx,%edi
        addl    56(%rsp),%eax
+       psrld   $31,%xmm8
        xorl    %edx,%ecx
        roll    $5,%ebx
-       psrld   $31,%xmm8
        addl    %esi,%eax
-       andl    %ecx,%edi
        movdqa  %xmm10,%xmm9
+       andl    %ecx,%edi
        xorl    %edx,%ecx
-       addl    %ebx,%eax
        psrld   $30,%xmm10
-       por     %xmm8,%xmm7
+       addl    %ebx,%eax
        rorl    $7,%ebx
+       por     %xmm8,%xmm7
        xorl    %edx,%edi
        movl    %eax,%esi
        addl    60(%rsp),%ebp
        pslld   $2,%xmm9
        pxor    %xmm10,%xmm7
        xorl    %ecx,%ebx
+       movdqa  -32(%r14),%xmm10
        roll    $5,%eax
-       movdqa  16(%r11),%xmm10
        addl    %edi,%ebp
        andl    %ebx,%esi
        pxor    %xmm9,%xmm7
+       pshufd  $238,%xmm6,%xmm9
        xorl    %ecx,%ebx
        addl    %eax,%ebp
-       movdqa  %xmm7,%xmm9
        rorl    $7,%eax
        pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,206,8
        xorl    %ecx,%esi
        movl    %ebp,%edi
        addl    0(%rsp),%edx
-       pxor    %xmm1,%xmm0
+       punpcklqdq      %xmm7,%xmm9
        xorl    %ebx,%eax
        roll    $5,%ebp
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm7,%xmm10
+       pxor    %xmm1,%xmm0
        addl    %esi,%edx
        andl    %eax,%edi
-       pxor    %xmm9,%xmm0
+       movdqa  %xmm10,%xmm8
        xorl    %ebx,%eax
+       paddd   %xmm7,%xmm10
        addl    %ebp,%edx
+       pxor    %xmm9,%xmm0
        rorl    $7,%ebp
        xorl    %ebx,%edi
-       movdqa  %xmm0,%xmm9
-       movdqa  %xmm10,48(%rsp)
        movl    %edx,%esi
        addl    4(%rsp),%ecx
+       movdqa  %xmm0,%xmm9
        xorl    %eax,%ebp
        roll    $5,%edx
-       pslld   $2,%xmm0
+       movdqa  %xmm10,48(%rsp)
        addl    %edi,%ecx
        andl    %ebp,%esi
-       psrld   $30,%xmm9
        xorl    %eax,%ebp
+       pslld   $2,%xmm0
        addl    %edx,%ecx
        rorl    $7,%edx
+       psrld   $30,%xmm9
        xorl    %eax,%esi
        movl    %ecx,%edi
        addl    8(%rsp),%ebx
        por     %xmm9,%xmm0
        xorl    %ebp,%edx
        roll    $5,%ecx
-       movdqa  %xmm0,%xmm10
+       pshufd  $238,%xmm7,%xmm10
        addl    %esi,%ebx
        andl    %edx,%edi
        xorl    %ebp,%edx
@@ -1671,18 +1818,18 @@ L$oop_ssse3:
        xorl    %edx,%esi
        rorl    $7,%ecx
        addl    %ebx,%eax
-       addl    16(%rsp),%ebp
        pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,215,8
+       addl    16(%rsp),%ebp
        xorl    %ecx,%esi
+       punpcklqdq      %xmm0,%xmm10
        movl    %eax,%edi
        roll    $5,%eax
        pxor    %xmm2,%xmm1
        addl    %esi,%ebp
        xorl    %ecx,%edi
        movdqa  %xmm8,%xmm9
-       paddd   %xmm0,%xmm8
        rorl    $7,%ebx
+       paddd   %xmm0,%xmm8
        addl    %eax,%ebp
        pxor    %xmm10,%xmm1
        addl    20(%rsp),%edx
@@ -1690,43 +1837,43 @@ L$oop_ssse3:
        movl    %ebp,%esi
        roll    $5,%ebp
        movdqa  %xmm1,%xmm10
-       movdqa  %xmm8,0(%rsp)
        addl    %edi,%edx
        xorl    %ebx,%esi
+       movdqa  %xmm8,0(%rsp)
        rorl    $7,%eax
        addl    %ebp,%edx
-       pslld   $2,%xmm1
        addl    24(%rsp),%ecx
+       pslld   $2,%xmm1
        xorl    %eax,%esi
-       psrld   $30,%xmm10
        movl    %edx,%edi
+       psrld   $30,%xmm10
        roll    $5,%edx
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
-       addl    %edx,%ecx
        por     %xmm10,%xmm1
+       addl    %edx,%ecx
        addl    28(%rsp),%ebx
+       pshufd  $238,%xmm0,%xmm8
        xorl    %ebp,%edi
-       movdqa  %xmm1,%xmm8
        movl    %ecx,%esi
        roll    $5,%ecx
        addl    %edi,%ebx
        xorl    %ebp,%esi
        rorl    $7,%edx
        addl    %ecx,%ebx
-       addl    32(%rsp),%eax
        pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,192,8
+       addl    32(%rsp),%eax
        xorl    %edx,%esi
+       punpcklqdq      %xmm1,%xmm8
        movl    %ebx,%edi
        roll    $5,%ebx
        pxor    %xmm3,%xmm2
        addl    %esi,%eax
        xorl    %edx,%edi
-       movdqa  32(%r11),%xmm10
-       paddd   %xmm1,%xmm9
+       movdqa  0(%r14),%xmm10
        rorl    $7,%ecx
+       paddd   %xmm1,%xmm9
        addl    %ebx,%eax
        pxor    %xmm8,%xmm2
        addl    36(%rsp),%ebp
@@ -1734,43 +1881,43 @@ L$oop_ssse3:
        movl    %eax,%esi
        roll    $5,%eax
        movdqa  %xmm2,%xmm8
-       movdqa  %xmm9,16(%rsp)
        addl    %edi,%ebp
        xorl    %ecx,%esi
+       movdqa  %xmm9,16(%rsp)
        rorl    $7,%ebx
        addl    %eax,%ebp
-       pslld   $2,%xmm2
        addl    40(%rsp),%edx
+       pslld   $2,%xmm2
        xorl    %ebx,%esi
-       psrld   $30,%xmm8
        movl    %ebp,%edi
+       psrld   $30,%xmm8
        roll    $5,%ebp
        addl    %esi,%edx
        xorl    %ebx,%edi
        rorl    $7,%eax
-       addl    %ebp,%edx
        por     %xmm8,%xmm2
+       addl    %ebp,%edx
        addl    44(%rsp),%ecx
+       pshufd  $238,%xmm1,%xmm9
        xorl    %eax,%edi
-       movdqa  %xmm2,%xmm9
        movl    %edx,%esi
        roll    $5,%edx
        addl    %edi,%ecx
        xorl    %eax,%esi
        rorl    $7,%ebp
        addl    %edx,%ecx
-       addl    48(%rsp),%ebx
        pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,201,8
+       addl    48(%rsp),%ebx
        xorl    %ebp,%esi
+       punpcklqdq      %xmm2,%xmm9
        movl    %ecx,%edi
        roll    $5,%ecx
        pxor    %xmm4,%xmm3
        addl    %esi,%ebx
        xorl    %ebp,%edi
        movdqa  %xmm10,%xmm8
-       paddd   %xmm2,%xmm10
        rorl    $7,%edx
+       paddd   %xmm2,%xmm10
        addl    %ecx,%ebx
        pxor    %xmm9,%xmm3
        addl    52(%rsp),%eax
@@ -1778,43 +1925,43 @@ L$oop_ssse3:
        movl    %ebx,%esi
        roll    $5,%ebx
        movdqa  %xmm3,%xmm9
-       movdqa  %xmm10,32(%rsp)
        addl    %edi,%eax
        xorl    %edx,%esi
+       movdqa  %xmm10,32(%rsp)
        rorl    $7,%ecx
        addl    %ebx,%eax
-       pslld   $2,%xmm3
        addl    56(%rsp),%ebp
+       pslld   $2,%xmm3
        xorl    %ecx,%esi
-       psrld   $30,%xmm9
        movl    %eax,%edi
+       psrld   $30,%xmm9
        roll    $5,%eax
        addl    %esi,%ebp
        xorl    %ecx,%edi
        rorl    $7,%ebx
-       addl    %eax,%ebp
        por     %xmm9,%xmm3
+       addl    %eax,%ebp
        addl    60(%rsp),%edx
+       pshufd  $238,%xmm2,%xmm10
        xorl    %ebx,%edi
-       movdqa  %xmm3,%xmm10
        movl    %ebp,%esi
        roll    $5,%ebp
        addl    %edi,%edx
        xorl    %ebx,%esi
        rorl    $7,%eax
        addl    %ebp,%edx
-       addl    0(%rsp),%ecx
        pxor    %xmm0,%xmm4
-.byte  102,68,15,58,15,210,8
+       addl    0(%rsp),%ecx
        xorl    %eax,%esi
+       punpcklqdq      %xmm3,%xmm10
        movl    %edx,%edi
        roll    $5,%edx
        pxor    %xmm5,%xmm4
        addl    %esi,%ecx
        xorl    %eax,%edi
        movdqa  %xmm8,%xmm9
-       paddd   %xmm3,%xmm8
        rorl    $7,%ebp
+       paddd   %xmm3,%xmm8
        addl    %edx,%ecx
        pxor    %xmm10,%xmm4
        addl    4(%rsp),%ebx
@@ -1822,43 +1969,43 @@ L$oop_ssse3:
        movl    %ecx,%esi
        roll    $5,%ecx
        movdqa  %xmm4,%xmm10
-       movdqa  %xmm8,48(%rsp)
        addl    %edi,%ebx
        xorl    %ebp,%esi
+       movdqa  %xmm8,48(%rsp)
        rorl    $7,%edx
        addl    %ecx,%ebx
-       pslld   $2,%xmm4
        addl    8(%rsp),%eax
+       pslld   $2,%xmm4
        xorl    %edx,%esi
-       psrld   $30,%xmm10
        movl    %ebx,%edi
+       psrld   $30,%xmm10
        roll    $5,%ebx
        addl    %esi,%eax
        xorl    %edx,%edi
        rorl    $7,%ecx
-       addl    %ebx,%eax
        por     %xmm10,%xmm4
+       addl    %ebx,%eax
        addl    12(%rsp),%ebp
+       pshufd  $238,%xmm3,%xmm8
        xorl    %ecx,%edi
-       movdqa  %xmm4,%xmm8
        movl    %eax,%esi
        roll    $5,%eax
        addl    %edi,%ebp
        xorl    %ecx,%esi
        rorl    $7,%ebx
        addl    %eax,%ebp
-       addl    16(%rsp),%edx
        pxor    %xmm1,%xmm5
-.byte  102,68,15,58,15,195,8
+       addl    16(%rsp),%edx
        xorl    %ebx,%esi
+       punpcklqdq      %xmm4,%xmm8
        movl    %ebp,%edi
        roll    $5,%ebp
        pxor    %xmm6,%xmm5
        addl    %esi,%edx
        xorl    %ebx,%edi
        movdqa  %xmm9,%xmm10
-       paddd   %xmm4,%xmm9
        rorl    $7,%eax
+       paddd   %xmm4,%xmm9
        addl    %ebp,%edx
        pxor    %xmm8,%xmm5
        addl    20(%rsp),%ecx
@@ -1866,24 +2013,24 @@ L$oop_ssse3:
        movl    %edx,%esi
        roll    $5,%edx
        movdqa  %xmm5,%xmm8
-       movdqa  %xmm9,0(%rsp)
        addl    %edi,%ecx
        xorl    %eax,%esi
+       movdqa  %xmm9,0(%rsp)
        rorl    $7,%ebp
        addl    %edx,%ecx
-       pslld   $2,%xmm5
        addl    24(%rsp),%ebx
+       pslld   $2,%xmm5
        xorl    %ebp,%esi
-       psrld   $30,%xmm8
        movl    %ecx,%edi
+       psrld   $30,%xmm8
        roll    $5,%ecx
        addl    %esi,%ebx
        xorl    %ebp,%edi
        rorl    $7,%edx
-       addl    %ecx,%ebx
        por     %xmm8,%xmm5
+       addl    %ecx,%ebx
        addl    28(%rsp),%eax
-       movdqa  %xmm5,%xmm9
+       pshufd  $238,%xmm4,%xmm9
        rorl    $7,%ecx
        movl    %ebx,%esi
        xorl    %edx,%edi
@@ -1892,47 +2039,47 @@ L$oop_ssse3:
        xorl    %ecx,%esi
        xorl    %edx,%ecx
        addl    %ebx,%eax
-       addl    32(%rsp),%ebp
        pxor    %xmm2,%xmm6
-.byte  102,68,15,58,15,204,8
+       addl    32(%rsp),%ebp
        andl    %ecx,%esi
        xorl    %edx,%ecx
        rorl    $7,%ebx
-       pxor    %xmm7,%xmm6
+       punpcklqdq      %xmm5,%xmm9
        movl    %eax,%edi
        xorl    %ecx,%esi
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm5,%xmm10
+       pxor    %xmm7,%xmm6
        roll    $5,%eax
        addl    %esi,%ebp
-       pxor    %xmm9,%xmm6
+       movdqa  %xmm10,%xmm8
        xorl    %ebx,%edi
+       paddd   %xmm5,%xmm10
        xorl    %ecx,%ebx
+       pxor    %xmm9,%xmm6
        addl    %eax,%ebp
        addl    36(%rsp),%edx
-       movdqa  %xmm6,%xmm9
-       movdqa  %xmm10,16(%rsp)
        andl    %ebx,%edi
        xorl    %ecx,%ebx
        rorl    $7,%eax
+       movdqa  %xmm6,%xmm9
        movl    %ebp,%esi
-       pslld   $2,%xmm6
        xorl    %ebx,%edi
+       movdqa  %xmm10,16(%rsp)
        roll    $5,%ebp
-       psrld   $30,%xmm9
        addl    %edi,%edx
        xorl    %eax,%esi
+       pslld   $2,%xmm6
        xorl    %ebx,%eax
        addl    %ebp,%edx
+       psrld   $30,%xmm9
        addl    40(%rsp),%ecx
        andl    %eax,%esi
-       por     %xmm9,%xmm6
        xorl    %ebx,%eax
+       por     %xmm9,%xmm6
        rorl    $7,%ebp
-       movdqa  %xmm6,%xmm10
        movl    %edx,%edi
        xorl    %eax,%esi
        roll    $5,%edx
+       pshufd  $238,%xmm5,%xmm10
        addl    %esi,%ecx
        xorl    %ebp,%edi
        xorl    %eax,%ebp
@@ -1948,47 +2095,47 @@ L$oop_ssse3:
        xorl    %edx,%esi
        xorl    %ebp,%edx
        addl    %ecx,%ebx
-       addl    48(%rsp),%eax
        pxor    %xmm3,%xmm7
-.byte  102,68,15,58,15,213,8
+       addl    48(%rsp),%eax
        andl    %edx,%esi
        xorl    %ebp,%edx
        rorl    $7,%ecx
-       pxor    %xmm0,%xmm7
+       punpcklqdq      %xmm6,%xmm10
        movl    %ebx,%edi
        xorl    %edx,%esi
-       movdqa  48(%r11),%xmm9
-       paddd   %xmm6,%xmm8
+       pxor    %xmm0,%xmm7
        roll    $5,%ebx
        addl    %esi,%eax
-       pxor    %xmm10,%xmm7
+       movdqa  32(%r14),%xmm9
        xorl    %ecx,%edi
+       paddd   %xmm6,%xmm8
        xorl    %edx,%ecx
+       pxor    %xmm10,%xmm7
        addl    %ebx,%eax
        addl    52(%rsp),%ebp
-       movdqa  %xmm7,%xmm10
-       movdqa  %xmm8,32(%rsp)
        andl    %ecx,%edi
        xorl    %edx,%ecx
        rorl    $7,%ebx
+       movdqa  %xmm7,%xmm10
        movl    %eax,%esi
-       pslld   $2,%xmm7
        xorl    %ecx,%edi
+       movdqa  %xmm8,32(%rsp)
        roll    $5,%eax
-       psrld   $30,%xmm10
        addl    %edi,%ebp
        xorl    %ebx,%esi
+       pslld   $2,%xmm7
        xorl    %ecx,%ebx
        addl    %eax,%ebp
+       psrld   $30,%xmm10
        addl    56(%rsp),%edx
        andl    %ebx,%esi
-       por     %xmm10,%xmm7
        xorl    %ecx,%ebx
+       por     %xmm10,%xmm7
        rorl    $7,%eax
-       movdqa  %xmm7,%xmm8
        movl    %ebp,%edi
        xorl    %ebx,%esi
        roll    $5,%ebp
+       pshufd  $238,%xmm6,%xmm8
        addl    %esi,%edx
        xorl    %eax,%edi
        xorl    %ebx,%eax
@@ -2004,47 +2151,47 @@ L$oop_ssse3:
        xorl    %ebp,%esi
        xorl    %eax,%ebp
        addl    %edx,%ecx
-       addl    0(%rsp),%ebx
        pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,198,8
+       addl    0(%rsp),%ebx
        andl    %ebp,%esi
        xorl    %eax,%ebp
        rorl    $7,%edx
-       pxor    %xmm1,%xmm0
+       punpcklqdq      %xmm7,%xmm8
        movl    %ecx,%edi
        xorl    %ebp,%esi
-       movdqa  %xmm9,%xmm10
-       paddd   %xmm7,%xmm9
+       pxor    %xmm1,%xmm0
        roll    $5,%ecx
        addl    %esi,%ebx
-       pxor    %xmm8,%xmm0
+       movdqa  %xmm9,%xmm10
        xorl    %edx,%edi
+       paddd   %xmm7,%xmm9
        xorl    %ebp,%edx
+       pxor    %xmm8,%xmm0
        addl    %ecx,%ebx
        addl    4(%rsp),%eax
-       movdqa  %xmm0,%xmm8
-       movdqa  %xmm9,48(%rsp)
        andl    %edx,%edi
        xorl    %ebp,%edx
        rorl    $7,%ecx
+       movdqa  %xmm0,%xmm8
        movl    %ebx,%esi
-       pslld   $2,%xmm0
        xorl    %edx,%edi
+       movdqa  %xmm9,48(%rsp)
        roll    $5,%ebx
-       psrld   $30,%xmm8
        addl    %edi,%eax
        xorl    %ecx,%esi
+       pslld   $2,%xmm0
        xorl    %edx,%ecx
        addl    %ebx,%eax
+       psrld   $30,%xmm8
        addl    8(%rsp),%ebp
        andl    %ecx,%esi
-       por     %xmm8,%xmm0
        xorl    %edx,%ecx
+       por     %xmm8,%xmm0
        rorl    $7,%ebx
-       movdqa  %xmm0,%xmm9
        movl    %eax,%edi
        xorl    %ecx,%esi
        roll    $5,%eax
+       pshufd  $238,%xmm7,%xmm9
        addl    %esi,%ebp
        xorl    %ebx,%edi
        xorl    %ecx,%ebx
@@ -2060,47 +2207,47 @@ L$oop_ssse3:
        xorl    %eax,%esi
        xorl    %ebx,%eax
        addl    %ebp,%edx
-       addl    16(%rsp),%ecx
        pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,207,8
+       addl    16(%rsp),%ecx
        andl    %eax,%esi
        xorl    %ebx,%eax
        rorl    $7,%ebp
-       pxor    %xmm2,%xmm1
+       punpcklqdq      %xmm0,%xmm9
        movl    %edx,%edi
        xorl    %eax,%esi
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm0,%xmm10
+       pxor    %xmm2,%xmm1
        roll    $5,%edx
        addl    %esi,%ecx
-       pxor    %xmm9,%xmm1
+       movdqa  %xmm10,%xmm8
        xorl    %ebp,%edi
+       paddd   %xmm0,%xmm10
        xorl    %eax,%ebp
+       pxor    %xmm9,%xmm1
        addl    %edx,%ecx
        addl    20(%rsp),%ebx
-       movdqa  %xmm1,%xmm9
-       movdqa  %xmm10,0(%rsp)
        andl    %ebp,%edi
        xorl    %eax,%ebp
        rorl    $7,%edx
+       movdqa  %xmm1,%xmm9
        movl    %ecx,%esi
-       pslld   $2,%xmm1
        xorl    %ebp,%edi
+       movdqa  %xmm10,0(%rsp)
        roll    $5,%ecx
-       psrld   $30,%xmm9
        addl    %edi,%ebx
        xorl    %edx,%esi
+       pslld   $2,%xmm1
        xorl    %ebp,%edx
        addl    %ecx,%ebx
+       psrld   $30,%xmm9
        addl    24(%rsp),%eax
        andl    %edx,%esi
-       por     %xmm9,%xmm1
        xorl    %ebp,%edx
+       por     %xmm9,%xmm1
        rorl    $7,%ecx
-       movdqa  %xmm1,%xmm10
        movl    %ebx,%edi
        xorl    %edx,%esi
        roll    $5,%ebx
+       pshufd  $238,%xmm0,%xmm10
        addl    %esi,%eax
        xorl    %ecx,%edi
        xorl    %edx,%ecx
@@ -2116,47 +2263,47 @@ L$oop_ssse3:
        xorl    %ebx,%esi
        xorl    %ecx,%ebx
        addl    %eax,%ebp
-       addl    32(%rsp),%edx
        pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,208,8
+       addl    32(%rsp),%edx
        andl    %ebx,%esi
        xorl    %ecx,%ebx
        rorl    $7,%eax
-       pxor    %xmm3,%xmm2
+       punpcklqdq      %xmm1,%xmm10
        movl    %ebp,%edi
        xorl    %ebx,%esi
-       movdqa  %xmm8,%xmm9
-       paddd   %xmm1,%xmm8
+       pxor    %xmm3,%xmm2
        roll    $5,%ebp
        addl    %esi,%edx
-       pxor    %xmm10,%xmm2
+       movdqa  %xmm8,%xmm9
        xorl    %eax,%edi
+       paddd   %xmm1,%xmm8
        xorl    %ebx,%eax
+       pxor    %xmm10,%xmm2
        addl    %ebp,%edx
        addl    36(%rsp),%ecx
-       movdqa  %xmm2,%xmm10
-       movdqa  %xmm8,16(%rsp)
        andl    %eax,%edi
        xorl    %ebx,%eax
        rorl    $7,%ebp
+       movdqa  %xmm2,%xmm10
        movl    %edx,%esi
-       pslld   $2,%xmm2
        xorl    %eax,%edi
+       movdqa  %xmm8,16(%rsp)
        roll    $5,%edx
-       psrld   $30,%xmm10
        addl    %edi,%ecx
        xorl    %ebp,%esi
+       pslld   $2,%xmm2
        xorl    %eax,%ebp
        addl    %edx,%ecx
+       psrld   $30,%xmm10
        addl    40(%rsp),%ebx
        andl    %ebp,%esi
-       por     %xmm10,%xmm2
        xorl    %eax,%ebp
+       por     %xmm10,%xmm2
        rorl    $7,%edx
-       movdqa  %xmm2,%xmm8
        movl    %ecx,%edi
        xorl    %ebp,%esi
        roll    $5,%ecx
+       pshufd  $238,%xmm1,%xmm8
        addl    %esi,%ebx
        xorl    %edx,%edi
        xorl    %ebp,%edx
@@ -2171,18 +2318,18 @@ L$oop_ssse3:
        addl    %edi,%eax
        xorl    %edx,%esi
        addl    %ebx,%eax
-       addl    48(%rsp),%ebp
        pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,193,8
+       addl    48(%rsp),%ebp
        xorl    %ecx,%esi
+       punpcklqdq      %xmm2,%xmm8
        movl    %eax,%edi
        roll    $5,%eax
        pxor    %xmm4,%xmm3
        addl    %esi,%ebp
        xorl    %ecx,%edi
        movdqa  %xmm9,%xmm10
-       paddd   %xmm2,%xmm9
        rorl    $7,%ebx
+       paddd   %xmm2,%xmm9
        addl    %eax,%ebp
        pxor    %xmm8,%xmm3
        addl    52(%rsp),%edx
@@ -2190,22 +2337,22 @@ L$oop_ssse3:
        movl    %ebp,%esi
        roll    $5,%ebp
        movdqa  %xmm3,%xmm8
-       movdqa  %xmm9,32(%rsp)
        addl    %edi,%edx
        xorl    %ebx,%esi
+       movdqa  %xmm9,32(%rsp)
        rorl    $7,%eax
        addl    %ebp,%edx
-       pslld   $2,%xmm3
        addl    56(%rsp),%ecx
+       pslld   $2,%xmm3
        xorl    %eax,%esi
-       psrld   $30,%xmm8
        movl    %edx,%edi
+       psrld   $30,%xmm8
        roll    $5,%edx
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
-       addl    %edx,%ecx
        por     %xmm8,%xmm3
+       addl    %edx,%ecx
        addl    60(%rsp),%ebx
        xorl    %ebp,%edi
        movl    %ecx,%esi
@@ -2215,13 +2362,13 @@ L$oop_ssse3:
        rorl    $7,%edx
        addl    %ecx,%ebx
        addl    0(%rsp),%eax
-       paddd   %xmm3,%xmm10
        xorl    %edx,%esi
        movl    %ebx,%edi
        roll    $5,%ebx
+       paddd   %xmm3,%xmm10
        addl    %esi,%eax
-       movdqa  %xmm10,48(%rsp)
        xorl    %edx,%edi
+       movdqa  %xmm10,48(%rsp)
        rorl    $7,%ecx
        addl    %ebx,%eax
        addl    4(%rsp),%ebp
@@ -2250,8 +2397,8 @@ L$oop_ssse3:
        addl    %edx,%ecx
        cmpq    %r10,%r9
        je      L$done_ssse3
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
+       movdqa  64(%r14),%xmm6
+       movdqa  -64(%r14),%xmm9
        movdqu  0(%r9),%xmm0
        movdqu  16(%r9),%xmm1
        movdqu  32(%r9),%xmm2
@@ -2260,23 +2407,23 @@ L$oop_ssse3:
        addq    $64,%r9
        addl    16(%rsp),%ebx
        xorl    %ebp,%esi
-.byte  102,15,56,0,206
        movl    %ecx,%edi
+.byte  102,15,56,0,206
        roll    $5,%ecx
-       paddd   %xmm9,%xmm0
        addl    %esi,%ebx
        xorl    %ebp,%edi
        rorl    $7,%edx
+       paddd   %xmm9,%xmm0
        addl    %ecx,%ebx
-       movdqa  %xmm0,0(%rsp)
        addl    20(%rsp),%eax
        xorl    %edx,%edi
-       psubd   %xmm9,%xmm0
        movl    %ebx,%esi
+       movdqa  %xmm0,0(%rsp)
        roll    $5,%ebx
        addl    %edi,%eax
        xorl    %edx,%esi
        rorl    $7,%ecx
+       psubd   %xmm9,%xmm0
        addl    %ebx,%eax
        addl    24(%rsp),%ebp
        xorl    %ecx,%esi
@@ -2296,23 +2443,23 @@ L$oop_ssse3:
        addl    %ebp,%edx
        addl    32(%rsp),%ecx
        xorl    %eax,%esi
-.byte  102,15,56,0,214
        movl    %edx,%edi
+.byte  102,15,56,0,214
        roll    $5,%edx
-       paddd   %xmm9,%xmm1
        addl    %esi,%ecx
        xorl    %eax,%edi
        rorl    $7,%ebp
+       paddd   %xmm9,%xmm1
        addl    %edx,%ecx
-       movdqa  %xmm1,16(%rsp)
        addl    36(%rsp),%ebx
        xorl    %ebp,%edi
-       psubd   %xmm9,%xmm1
        movl    %ecx,%esi
+       movdqa  %xmm1,16(%rsp)
        roll    $5,%ecx
        addl    %edi,%ebx
        xorl    %ebp,%esi
        rorl    $7,%edx
+       psubd   %xmm9,%xmm1
        addl    %ecx,%ebx
        addl    40(%rsp),%eax
        xorl    %edx,%esi
@@ -2332,23 +2479,23 @@ L$oop_ssse3:
        addl    %eax,%ebp
        addl    48(%rsp),%edx
        xorl    %ebx,%esi
-.byte  102,15,56,0,222
        movl    %ebp,%edi
+.byte  102,15,56,0,222
        roll    $5,%ebp
-       paddd   %xmm9,%xmm2
        addl    %esi,%edx
        xorl    %ebx,%edi
        rorl    $7,%eax
+       paddd   %xmm9,%xmm2
        addl    %ebp,%edx
-       movdqa  %xmm2,32(%rsp)
        addl    52(%rsp),%ecx
        xorl    %eax,%edi
-       psubd   %xmm9,%xmm2
        movl    %edx,%esi
+       movdqa  %xmm2,32(%rsp)
        roll    $5,%edx
        addl    %edi,%ecx
        xorl    %eax,%esi
        rorl    $7,%ebp
+       psubd   %xmm9,%xmm2
        addl    %edx,%ecx
        addl    56(%rsp),%ebx
        xorl    %ebp,%esi
@@ -2488,21 +2635,2856 @@ L$done_ssse3:
        movl    %ecx,8(%r8)
        movl    %edx,12(%r8)
        movl    %ebp,16(%r8)
-       leaq    64(%rsp),%rsi
-       movq    0(%rsi),%r12
-       movq    8(%rsi),%rbp
-       movq    16(%rsi),%rbx
-       leaq    24(%rsi),%rsp
+       movq    -40(%r11),%r14
+
+       movq    -32(%r11),%r13
+
+       movq    -24(%r11),%r12
+
+       movq    -16(%r11),%rbp
+
+       movq    -8(%r11),%rbx
+
+       leaq    (%r11),%rsp
+
 L$epilogue_ssse3:
        .byte   0xf3,0xc3
 
+
+
+.p2align       4
+sha1_block_data_order_avx:  # AVX code path (SHA-1 block routine, going by its name): rdi -> five 32-bit state words, rsi -> input bytes, rdx = number of 64-byte units
+_avx_shortcut:  # second label on the same entry point
+
+       movq    %rsp,%r11  # r11 = rsp at entry; the epilogue restores registers and rsp relative to it
+
+       pushq   %rbx  # save rbx, rbp, r12, r13, r14 (reloaded from -8..-40(%r11) on exit)
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       leaq    -64(%rsp),%rsp  # reserve 64 bytes of scratch, addressed below as 0..60(%rsp)
+       vzeroupper
+       andq    $-64,%rsp  # round rsp down to a 64-byte boundary; the vmovdqa stores to the scratch area need an aligned address
+       movq    %rdi,%r8  # r8 = state pointer (five dwords read here, written back after each block)
+       movq    %rsi,%r9  # r9 = input cursor
+       movq    %rdx,%r10  # r10 = count of 64-byte units
+
+       shlq    $6,%r10  # r10 = count * 64
+       addq    %r9,%r10  # r10 = end-of-input pointer, compared with r9 at the bottom of the loop
+       leaq    K_XX_XX+64(%rip),%r14  # r14 = K_XX_XX + 64; the table is read at -64, -32, 0, 32 and 64 from here
+
+       movl    0(%r8),%eax  # load the five state words into eax, ebx, ecx, edx, ebp
+       movl    4(%r8),%ebx
+       movl    8(%r8),%ecx
+       movl    12(%r8),%edx
+       movl    %ebx,%esi  # esi = ebx & (ecx ^ edx), consumed at the top of L$oop_avx
+       movl    16(%r8),%ebp
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       andl    %edi,%esi
+
+       vmovdqa 64(%r14),%xmm6  # xmm6 = vpshufb control mask (presumably a byte-order swap; confirm against K_XX_XX)
+       vmovdqa -64(%r14),%xmm11  # xmm11 = constant vector added to the message words (presumably the first round constant)
+       vmovdqu 0(%r9),%xmm0  # xmm0..xmm3 = 64 bytes of input, unaligned loads
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0  # permute the bytes of each input vector with the mask
+       addq    $64,%r9  # advance the input cursor past this block
+       vpshufb %xmm6,%xmm1,%xmm1
+       vpshufb %xmm6,%xmm2,%xmm2
+       vpshufb %xmm6,%xmm3,%xmm3
+       vpaddd  %xmm11,%xmm0,%xmm4  # pre-add the constant to the first three input vectors ...
+       vpaddd  %xmm11,%xmm1,%xmm5
+       vpaddd  %xmm11,%xmm2,%xmm6  # (xmm6 is reused as data from here; the mask is reloaded per block)
+       vmovdqa %xmm4,0(%rsp)  # ... and park the sums on the stack for the scalar rounds
+       vmovdqa %xmm5,16(%rsp)
+       vmovdqa %xmm6,32(%rsp)
+       jmp     L$oop_avx
+.p2align       4
+L$oop_avx:  # per-block loop: scalar rounds consume sums from 0..60(%rsp) while interleaved vector ops produce later ones
+       shrdl   $2,%ebx,%ebx  # shrd/shld with identical source and destination act as rotate right/left
+       xorl    %edx,%esi
+       vpalignr        $8,%xmm0,%xmm1,%xmm4
+       movl    %eax,%edi
+       addl    0(%rsp),%ebp
+       vpaddd  %xmm3,%xmm11,%xmm9  # xmm9 = fourth input vector + constant, stored to 48(%rsp) below
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrldq $4,%xmm3,%xmm8
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       vpxor   %xmm0,%xmm4,%xmm4
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm2,%xmm8,%xmm8
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    4(%rsp),%edx
+       vpxor   %xmm8,%xmm4,%xmm4
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%edx
+       andl    %eax,%esi
+       vpsrld  $31,%xmm4,%xmm8
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpslldq $12,%xmm4,%xmm10
+       vpaddd  %xmm4,%xmm4,%xmm4
+       movl    %edx,%edi
+       addl    8(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm4,%xmm4
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    12(%rsp),%ebx
+       vpxor   %xmm10,%xmm4,%xmm4
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpalignr        $8,%xmm1,%xmm2,%xmm5
+       movl    %ebx,%edi
+       addl    16(%rsp),%eax
+       vpaddd  %xmm4,%xmm11,%xmm9
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrldq $4,%xmm4,%xmm8
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       vpxor   %xmm1,%xmm5,%xmm5
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm3,%xmm8,%xmm8
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    20(%rsp),%ebp
+       vpxor   %xmm8,%xmm5,%xmm5
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       vpsrld  $31,%xmm5,%xmm8
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       vpslldq $12,%xmm5,%xmm10
+       vpaddd  %xmm5,%xmm5,%xmm5
+       movl    %ebp,%edi
+       addl    24(%rsp),%edx
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    %esi,%edx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm5,%xmm5
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       movl    %edx,%esi
+       addl    28(%rsp),%ecx
+       vpxor   %xmm10,%xmm5,%xmm5
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vmovdqa -32(%r14),%xmm11  # switch xmm11 to the constant vector at K_XX_XX+32
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       vpalignr        $8,%xmm2,%xmm3,%xmm6
+       movl    %ecx,%edi
+       addl    32(%rsp),%ebx
+       vpaddd  %xmm5,%xmm11,%xmm9
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vpsrldq $4,%xmm5,%xmm8
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       vpxor   %xmm2,%xmm6,%xmm6
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm4,%xmm8,%xmm8
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       addl    36(%rsp),%eax
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%eax
+       andl    %ecx,%esi
+       vpsrld  $31,%xmm6,%xmm8
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%esi
+       vpslldq $12,%xmm6,%xmm10
+       vpaddd  %xmm6,%xmm6,%xmm6
+       movl    %eax,%edi
+       addl    40(%rsp),%ebp
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm6,%xmm6
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm6,%xmm6
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    44(%rsp),%edx
+       vpxor   %xmm10,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpalignr        $8,%xmm3,%xmm4,%xmm7
+       movl    %edx,%edi
+       addl    48(%rsp),%ecx
+       vpaddd  %xmm6,%xmm11,%xmm9
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrldq $4,%xmm6,%xmm8
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       vpxor   %xmm3,%xmm7,%xmm7
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm5,%xmm8,%xmm8
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    52(%rsp),%ebx
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       vpsrld  $31,%xmm7,%xmm8
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpslldq $12,%xmm7,%xmm10
+       vpaddd  %xmm7,%xmm7,%xmm7
+       movl    %ebx,%edi
+       addl    56(%rsp),%eax
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm10,%xmm9
+       vpor    %xmm8,%xmm7,%xmm7
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm7,%xmm7
+       shrdl   $7,%ebx,%ebx
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    60(%rsp),%ebp
+       vpxor   %xmm10,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       movl    %ebp,%edi
+       addl    0(%rsp),%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm7,%xmm11,%xmm9
+       addl    %esi,%edx
+       andl    %eax,%edi
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       movl    %edx,%esi
+       addl    4(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpslld  $2,%xmm0,%xmm0
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       movl    %ecx,%edi
+       addl    8(%rsp),%ebx
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    12(%rsp),%eax
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm2,%xmm1,%xmm1
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm0,%xmm11,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm1,%xmm1
+       addl    20(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm1,%xmm1
+       addl    24(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm1,%xmm1
+       addl    28(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       vpxor   %xmm3,%xmm2,%xmm2
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       vpaddd  %xmm1,%xmm11,%xmm9
+       vmovdqa 0(%r14),%xmm11  # switch xmm11 to the constant vector at K_XX_XX+64
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm8,%xmm2,%xmm2
+       addl    36(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm2,%xmm2
+       addl    40(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpor    %xmm8,%xmm2,%xmm2
+       addl    44(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       vpaddd  %xmm2,%xmm11,%xmm9
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       vpxor   %xmm0,%xmm4,%xmm4
+       addl    0(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpxor   %xmm5,%xmm4,%xmm4
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       vpaddd  %xmm3,%xmm11,%xmm9
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm8,%xmm4,%xmm4
+       addl    4(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       vpsrld  $30,%xmm4,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpslld  $2,%xmm4,%xmm4
+       addl    8(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    12(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       vpxor   %xmm1,%xmm5,%xmm5
+       addl    16(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpxor   %xmm6,%xmm5,%xmm5
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       vpaddd  %xmm4,%xmm11,%xmm9
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpxor   %xmm8,%xmm5,%xmm5
+       addl    20(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm5,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm5,%xmm5
+       addl    24(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    28(%rsp),%eax
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       vpxor   %xmm2,%xmm6,%xmm6
+       addl    32(%rsp),%ebp
+       andl    %ecx,%esi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       vpaddd  %xmm5,%xmm11,%xmm9
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    36(%rsp),%edx
+       vpsrld  $30,%xmm6,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       vpslld  $2,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    40(%rsp),%ecx
+       andl    %eax,%esi
+       vpor    %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    44(%rsp),%ebx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%esi
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       vpxor   %xmm3,%xmm7,%xmm7
+       addl    48(%rsp),%eax
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       vpxor   %xmm0,%xmm7,%xmm7
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       vpaddd  %xmm6,%xmm11,%xmm9
+       vmovdqa 32(%r14),%xmm11  # switch xmm11 to the constant vector at K_XX_XX+96
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    52(%rsp),%ebp
+       vpsrld  $30,%xmm7,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       vpslld  $2,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    56(%rsp),%edx
+       andl    %ebx,%esi
+       vpor    %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    60(%rsp),%ecx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       addl    0(%rsp),%ebx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       vpaddd  %xmm7,%xmm11,%xmm9
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    4(%rsp),%eax
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       vpslld  $2,%xmm0,%xmm0
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    8(%rsp),%ebp
+       andl    %ecx,%esi
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    12(%rsp),%edx
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ecx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       vpxor   %xmm2,%xmm1,%xmm1
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       vpaddd  %xmm0,%xmm11,%xmm9
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vpxor   %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    20(%rsp),%ebx
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%esi
+       vpslld  $2,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    24(%rsp),%eax
+       andl    %edx,%esi
+       vpor    %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    28(%rsp),%ebp
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%edx
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       vpxor   %xmm3,%xmm2,%xmm2
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       vpaddd  %xmm1,%xmm11,%xmm9
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       vpxor   %xmm8,%xmm2,%xmm2
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    36(%rsp),%ecx
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       vpslld  $2,%xmm2,%xmm2
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    40(%rsp),%ebx
+       andl    %ebp,%esi
+       vpor    %xmm8,%xmm2,%xmm2
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    44(%rsp),%eax
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm2,%xmm11,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    0(%rsp),%eax
+       vpaddd  %xmm3,%xmm11,%xmm9
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vmovdqa %xmm9,48(%rsp)
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    4(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    8(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    12(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       cmpq    %r10,%r9  # has the input cursor reached the end pointer?
+       je      L$done_avx  # yes: finish this block without loading any further input
+       vmovdqa 64(%r14),%xmm6  # reload the shuffle mask (xmm6 was reused as data) and the first constant
+       vmovdqa -64(%r14),%xmm11
+       vmovdqu 0(%r9),%xmm0  # fetch the next 64 input bytes into xmm0..xmm3
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0
+       addq    $64,%r9  # advance the input cursor
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       vpshufb %xmm6,%xmm1,%xmm1
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpaddd  %xmm11,%xmm0,%xmm4
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vmovdqa %xmm4,0(%rsp)  # stage the next block's first sum; this block has already consumed 0..12(%rsp)
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       vpshufb %xmm6,%xmm2,%xmm2
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpaddd  %xmm11,%xmm1,%xmm5
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vmovdqa %xmm5,16(%rsp)
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       vpshufb %xmm6,%xmm3,%xmm3
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm11,%xmm2,%xmm6
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vmovdqa %xmm6,32(%rsp)
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi  # esi keeps the pre-rotate copy of ebx; it is what gets added to 4(%r8) below
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    0(%r8),%eax  # add the previous state at (%r8) into the working registers and store the result back
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       addl    12(%r8),%edx
+       movl    %eax,0(%r8)
+       addl    16(%r8),%ebp
+       movl    %esi,4(%r8)
+       movl    %esi,%ebx  # ebx = updated second state word for the next block
+       movl    %ecx,8(%r8)
+       movl    %ecx,%edi  # re-prime esi = ebx & (ecx ^ edx), as the prologue did
+       movl    %edx,12(%r8)
+       xorl    %edx,%edi
+       movl    %ebp,16(%r8)
+       andl    %edi,%esi
+       jmp     L$oop_avx
+
+.p2align       4
+L$done_avx:  # final block: the same scalar tail as the loop path, without the vector loads/stores for a following block
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi  # as in the loop path, esi carries the value added to 4(%r8) below
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vzeroupper
+
+       addl    0(%r8),%eax  # fold the working registers into the state at (%r8) one last time
+       addl    4(%r8),%esi
+       addl    8(%r8),%ecx
+       movl    %eax,0(%r8)
+       addl    12(%r8),%edx
+       movl    %esi,4(%r8)
+       addl    16(%r8),%ebp
+       movl    %ecx,8(%r8)
+       movl    %edx,12(%r8)
+       movl    %ebp,16(%r8)
+       movq    -40(%r11),%r14  # restore the five registers pushed in the prologue, addressed below the entry rsp held in r11
+
+       movq    -32(%r11),%r13
+
+       movq    -24(%r11),%r12
+
+       movq    -16(%r11),%rbp
+
+       movq    -8(%r11),%rbx
+
+       leaq    (%r11),%rsp  # rsp = its value at function entry
+
+L$epilogue_avx:
+       .byte   0xf3,0xc3  # raw encoding of "rep ret" (0xf3 prefix + 0xc3 ret)
+
+
+
+.p2align       4
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+
+       movq    %rsp,%r11
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       vzeroupper
+       movq    %rdi,%r8
+       movq    %rsi,%r9
+       movq    %rdx,%r10
+
+       leaq    -640(%rsp),%rsp
+       shlq    $6,%r10
+       leaq    64(%r9),%r13
+       andq    $-128,%rsp
+       addq    %r9,%r10
+       leaq    K_XX_XX+64(%rip),%r14
+
+       movl    0(%r8),%eax
+       cmpq    %r10,%r13
+       cmovaeq %r9,%r13
+       movl    4(%r8),%ebp
+       movl    8(%r8),%ecx
+       movl    12(%r8),%edx
+       movl    16(%r8),%esi
+       vmovdqu 64(%r14),%ymm6
+
+       vmovdqu (%r9),%xmm0
+       vmovdqu 16(%r9),%xmm1
+       vmovdqu 32(%r9),%xmm2
+       vmovdqu 48(%r9),%xmm3
+       leaq    64(%r9),%r9
+       vinserti128     $1,(%r13),%ymm0,%ymm0
+       vinserti128     $1,16(%r13),%ymm1,%ymm1
+       vpshufb %ymm6,%ymm0,%ymm0
+       vinserti128     $1,32(%r13),%ymm2,%ymm2
+       vpshufb %ymm6,%ymm1,%ymm1
+       vinserti128     $1,48(%r13),%ymm3,%ymm3
+       vpshufb %ymm6,%ymm2,%ymm2
+       vmovdqu -64(%r14),%ymm11
+       vpshufb %ymm6,%ymm3,%ymm3
+
+       vpaddd  %ymm11,%ymm0,%ymm4
+       vpaddd  %ymm11,%ymm1,%ymm5
+       vmovdqu %ymm4,0(%rsp)
+       vpaddd  %ymm11,%ymm2,%ymm6
+       vmovdqu %ymm5,32(%rsp)
+       vpaddd  %ymm11,%ymm3,%ymm7
+       vmovdqu %ymm6,64(%rsp)
+       vmovdqu %ymm7,96(%rsp)
+       vpalignr        $8,%ymm0,%ymm1,%ymm4
+       vpsrldq $4,%ymm3,%ymm8
+       vpxor   %ymm0,%ymm4,%ymm4
+       vpxor   %ymm2,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm4,%ymm4
+       vpsrld  $31,%ymm4,%ymm8
+       vpslldq $12,%ymm4,%ymm10
+       vpaddd  %ymm4,%ymm4,%ymm4
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm4,%ymm4
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm4,%ymm4
+       vpxor   %ymm10,%ymm4,%ymm4
+       vpaddd  %ymm11,%ymm4,%ymm9
+       vmovdqu %ymm9,128(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm5
+       vpsrldq $4,%ymm4,%ymm8
+       vpxor   %ymm1,%ymm5,%ymm5
+       vpxor   %ymm3,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm5,%ymm5
+       vpsrld  $31,%ymm5,%ymm8
+       vmovdqu -32(%r14),%ymm11
+       vpslldq $12,%ymm5,%ymm10
+       vpaddd  %ymm5,%ymm5,%ymm5
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm5,%ymm5
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm5,%ymm5
+       vpxor   %ymm10,%ymm5,%ymm5
+       vpaddd  %ymm11,%ymm5,%ymm9
+       vmovdqu %ymm9,160(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm6
+       vpsrldq $4,%ymm5,%ymm8
+       vpxor   %ymm2,%ymm6,%ymm6
+       vpxor   %ymm4,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm6,%ymm6
+       vpsrld  $31,%ymm6,%ymm8
+       vpslldq $12,%ymm6,%ymm10
+       vpaddd  %ymm6,%ymm6,%ymm6
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm6,%ymm6
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm6,%ymm6
+       vpxor   %ymm10,%ymm6,%ymm6
+       vpaddd  %ymm11,%ymm6,%ymm9
+       vmovdqu %ymm9,192(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm7
+       vpsrldq $4,%ymm6,%ymm8
+       vpxor   %ymm3,%ymm7,%ymm7
+       vpxor   %ymm5,%ymm8,%ymm8
+       vpxor   %ymm8,%ymm7,%ymm7
+       vpsrld  $31,%ymm7,%ymm8
+       vpslldq $12,%ymm7,%ymm10
+       vpaddd  %ymm7,%ymm7,%ymm7
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm7,%ymm7
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm7,%ymm7
+       vpxor   %ymm10,%ymm7,%ymm7
+       vpaddd  %ymm11,%ymm7,%ymm9
+       vmovdqu %ymm9,224(%rsp)
+       leaq    128(%rsp),%r13
+       jmp     L$oop_avx2
+.p2align       5
+L$oop_avx2:
+       rorxl   $2,%ebp,%ebx
+       andnl   %edx,%ebp,%edi
+       andl    %ecx,%ebp
+       xorl    %edi,%ebp
+       jmp     L$align32_1
+.p2align       5
+L$align32_1:
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       vpxor   %ymm4,%ymm0,%ymm0
+       addl    -128(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       vpxor   %ymm1,%ymm0,%ymm0
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpxor   %ymm8,%ymm0,%ymm0
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       vpsrld  $30,%ymm0,%ymm8
+       vpslld  $2,%ymm0,%ymm0
+       addl    -124(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       vpor    %ymm8,%ymm0,%ymm0
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -120(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       vpaddd  %ymm11,%ymm0,%ymm9
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       vmovdqu %ymm9,256(%rsp)
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -116(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -96(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       vpxor   %ymm5,%ymm1,%ymm1
+       addl    -92(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       vpxor   %ymm2,%ymm1,%ymm1
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       vpxor   %ymm8,%ymm1,%ymm1
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       vpsrld  $30,%ymm1,%ymm8
+       vpslld  $2,%ymm1,%ymm1
+       addl    -88(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       vpor    %ymm8,%ymm1,%ymm1
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -84(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       vmovdqu %ymm9,288(%rsp)
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -64(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -60(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       vpxor   %ymm6,%ymm2,%ymm2
+       addl    -56(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       vpxor   %ymm3,%ymm2,%ymm2
+       vmovdqu 0(%r14),%ymm11
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpxor   %ymm8,%ymm2,%ymm2
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       vpsrld  $30,%ymm2,%ymm8
+       vpslld  $2,%ymm2,%ymm2
+       addl    -52(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       vpor    %ymm8,%ymm2,%ymm2
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -32(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       vpaddd  %ymm11,%ymm2,%ymm9
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       vmovdqu %ymm9,320(%rsp)
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -28(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -24(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       vpxor   %ymm7,%ymm3,%ymm3
+       addl    -20(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       vpxor   %ymm4,%ymm3,%ymm3
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpxor   %ymm8,%ymm3,%ymm3
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       vpsrld  $30,%ymm3,%ymm8
+       vpslld  $2,%ymm3,%ymm3
+       addl    0(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       vpor    %ymm8,%ymm3,%ymm3
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    4(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       vpaddd  %ymm11,%ymm3,%ymm9
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       vmovdqu %ymm9,352(%rsp)
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    8(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    12(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       vpxor   %ymm0,%ymm4,%ymm4
+       addl    32(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpxor   %ymm8,%ymm4,%ymm4
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    36(%r13),%ebx
+       vpsrld  $30,%ymm4,%ymm8
+       vpslld  $2,%ymm4,%ymm4
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vpor    %ymm8,%ymm4,%ymm4
+       addl    40(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpaddd  %ymm11,%ymm4,%ymm9
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    44(%r13),%eax
+       vmovdqu %ymm9,384(%rsp)
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       vpxor   %ymm1,%ymm5,%ymm5
+       addl    68(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm6,%ymm5,%ymm5
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       vpxor   %ymm8,%ymm5,%ymm5
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    72(%r13),%ecx
+       vpsrld  $30,%ymm5,%ymm8
+       vpslld  $2,%ymm5,%ymm5
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       vpor    %ymm8,%ymm5,%ymm5
+       addl    76(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpaddd  %ymm11,%ymm5,%ymm9
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    96(%r13),%ebp
+       vmovdqu %ymm9,416(%rsp)
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    100(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       vpxor   %ymm2,%ymm6,%ymm6
+       addl    104(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       vpxor   %ymm8,%ymm6,%ymm6
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    108(%r13),%edx
+       leaq    256(%r13),%r13
+       vpsrld  $30,%ymm6,%ymm8
+       vpslld  $2,%ymm6,%ymm6
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vpor    %ymm8,%ymm6,%ymm6
+       addl    -128(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpaddd  %ymm11,%ymm6,%ymm9
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -124(%r13),%ebx
+       vmovdqu %ymm9,448(%rsp)
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -120(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       vpxor   %ymm3,%ymm7,%ymm7
+       addl    -116(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       vpxor   %ymm0,%ymm7,%ymm7
+       vmovdqu 32(%r14),%ymm11
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       vpxor   %ymm8,%ymm7,%ymm7
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -96(%r13),%esi
+       vpsrld  $30,%ymm7,%ymm8
+       vpslld  $2,%ymm7,%ymm7
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpor    %ymm8,%ymm7,%ymm7
+       addl    -92(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpaddd  %ymm11,%ymm7,%ymm9
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -88(%r13),%ecx
+       vmovdqu %ymm9,480(%rsp)
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -84(%r13),%ebx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       jmp     L$align32_2
+.p2align       5
+L$align32_2:
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       vpxor   %ymm4,%ymm0,%ymm0
+       addl    -64(%r13),%ebp
+       xorl    %esi,%ecx
+       vpxor   %ymm1,%ymm0,%ymm0
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       vpxor   %ymm8,%ymm0,%ymm0
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       vpsrld  $30,%ymm0,%ymm8
+       vpslld  $2,%ymm0,%ymm0
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -60(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       vpor    %ymm8,%ymm0,%ymm0
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       vpaddd  %ymm11,%ymm0,%ymm9
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    -56(%r13),%esi
+       xorl    %ecx,%ebp
+       vmovdqu %ymm9,512(%rsp)
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    -52(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    -32(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       vpxor   %ymm5,%ymm1,%ymm1
+       addl    -28(%r13),%ebx
+       xorl    %eax,%edx
+       vpxor   %ymm2,%ymm1,%ymm1
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       vpxor   %ymm8,%ymm1,%ymm1
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vpsrld  $30,%ymm1,%ymm8
+       vpslld  $2,%ymm1,%ymm1
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    -24(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       vpor    %ymm8,%ymm1,%ymm1
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -20(%r13),%eax
+       xorl    %edx,%ebx
+       vmovdqu %ymm9,544(%rsp)
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    0(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    4(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       vpxor   %ymm6,%ymm2,%ymm2
+       addl    8(%r13),%ecx
+       xorl    %ebp,%esi
+       vpxor   %ymm3,%ymm2,%ymm2
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       vpxor   %ymm8,%ymm2,%ymm2
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpsrld  $30,%ymm2,%ymm8
+       vpslld  $2,%ymm2,%ymm2
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    12(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       vpor    %ymm8,%ymm2,%ymm2
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vpaddd  %ymm11,%ymm2,%ymm9
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    32(%r13),%ebp
+       xorl    %esi,%ecx
+       vmovdqu %ymm9,576(%rsp)
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    36(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    40(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       vpxor   %ymm7,%ymm3,%ymm3
+       addl    44(%r13),%edx
+       xorl    %ebx,%eax
+       vpxor   %ymm4,%ymm3,%ymm3
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm8,%ymm3,%ymm3
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       vpsrld  $30,%ymm3,%ymm8
+       vpslld  $2,%ymm3,%ymm3
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    64(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       vpor    %ymm8,%ymm3,%ymm3
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpaddd  %ymm11,%ymm3,%ymm9
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    68(%r13),%ebx
+       xorl    %eax,%edx
+       vmovdqu %ymm9,608(%rsp)
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    72(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    76(%r13),%eax
+       xorl    %edx,%ebx
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    100(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    104(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    108(%r13),%ebx
+       leaq    256(%r13),%r13
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -128(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -124(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -120(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -116(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -96(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -92(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -88(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -84(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -60(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -56(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -52(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -32(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -28(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -24(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -20(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       addl    %r12d,%edx
+       leaq    128(%r9),%r13
+       leaq    128(%r9),%rdi
+       cmpq    %r10,%r13
+       cmovaeq %r9,%r13
+
+
+       addl    0(%r8),%edx
+       addl    4(%r8),%esi
+       addl    8(%r8),%ebp
+       movl    %edx,0(%r8)
+       addl    12(%r8),%ebx
+       movl    %esi,4(%r8)
+       movl    %edx,%eax
+       addl    16(%r8),%ecx
+       movl    %ebp,%r12d
+       movl    %ebp,8(%r8)
+       movl    %ebx,%edx
+
+       movl    %ebx,12(%r8)
+       movl    %esi,%ebp
+       movl    %ecx,16(%r8)
+
+       movl    %ecx,%esi
+       movl    %r12d,%ecx
+
+
+       cmpq    %r10,%r9
+       je      L$done_avx2
+       vmovdqu 64(%r14),%ymm6
+       cmpq    %r10,%rdi
+       ja      L$ast_avx2
+
+       vmovdqu -64(%rdi),%xmm0
+       vmovdqu -48(%rdi),%xmm1
+       vmovdqu -32(%rdi),%xmm2
+       vmovdqu -16(%rdi),%xmm3
+       vinserti128     $1,0(%r13),%ymm0,%ymm0
+       vinserti128     $1,16(%r13),%ymm1,%ymm1
+       vinserti128     $1,32(%r13),%ymm2,%ymm2
+       vinserti128     $1,48(%r13),%ymm3,%ymm3
+       jmp     L$ast_avx2
+
+.p2align       5
+L$ast_avx2:
+       leaq    128+16(%rsp),%r13
+       rorxl   $2,%ebp,%ebx
+       andnl   %edx,%ebp,%edi
+       andl    %ecx,%ebp
+       xorl    %edi,%ebp
+       subq    $-128,%r9
+       addl    -128(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -124(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -120(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -116(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -96(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    -92(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -88(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -84(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -64(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -60(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    -56(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    -52(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    -32(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    -28(%r13),%edx
+       andnl   %ebx,%esi,%edi
+       addl    %eax,%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       andl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %edi,%esi
+       addl    -24(%r13),%ecx
+       andnl   %ebp,%edx,%edi
+       addl    %esi,%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       andl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %edi,%edx
+       addl    -20(%r13),%ebx
+       andnl   %eax,%ecx,%edi
+       addl    %edx,%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       andl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %edi,%ecx
+       addl    0(%r13),%ebp
+       andnl   %esi,%ebx,%edi
+       addl    %ecx,%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       andl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %edi,%ebx
+       addl    4(%r13),%eax
+       andnl   %edx,%ebp,%edi
+       addl    %ebx,%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       andl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edi,%ebp
+       addl    8(%r13),%esi
+       andnl   %ecx,%eax,%edi
+       addl    %ebp,%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       andl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %edi,%eax
+       addl    12(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    32(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    36(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    40(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    44(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vmovdqu -64(%r14),%ymm11
+       vpshufb %ymm6,%ymm0,%ymm0
+       addl    68(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    72(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    76(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    96(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    100(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpshufb %ymm6,%ymm1,%ymm1
+       vpaddd  %ymm11,%ymm0,%ymm8
+       addl    104(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    108(%r13),%edx
+       leaq    256(%r13),%r13
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -128(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -124(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -120(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vmovdqu %ymm8,0(%rsp)
+       vpshufb %ymm6,%ymm2,%ymm2
+       vpaddd  %ymm11,%ymm1,%ymm9
+       addl    -116(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -92(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       addl    -88(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -84(%r13),%ebx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       vmovdqu %ymm9,32(%rsp)
+       vpshufb %ymm6,%ymm3,%ymm3
+       vpaddd  %ymm11,%ymm2,%ymm6
+       addl    -64(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -60(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    -56(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    -52(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       addl    -32(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       jmp     L$align32_3
+.p2align       5
+L$align32_3:
+       vmovdqu %ymm6,64(%rsp)
+       vpaddd  %ymm11,%ymm3,%ymm7
+       addl    -28(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    -24(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    -20(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    0(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       addl    4(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       andl    %edi,%esi
+       vmovdqu %ymm7,96(%rsp)
+       addl    8(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       addl    12(%r13),%ebx
+       xorl    %eax,%edx
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    32(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    36(%r13),%eax
+       xorl    %edx,%ebx
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       andl    %edi,%ebp
+       addl    40(%r13),%esi
+       xorl    %ecx,%ebp
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       andl    %edi,%eax
+       vpalignr        $8,%ymm0,%ymm1,%ymm4
+       addl    44(%r13),%edx
+       xorl    %ebx,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%edi
+       vpsrldq $4,%ymm3,%ymm8
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpxor   %ymm0,%ymm4,%ymm4
+       vpxor   %ymm2,%ymm8,%ymm8
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpxor   %ymm8,%ymm4,%ymm4
+       andl    %edi,%esi
+       addl    64(%r13),%ecx
+       xorl    %ebp,%esi
+       movl    %eax,%edi
+       vpsrld  $31,%ymm4,%ymm8
+       xorl    %ebp,%edi
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       vpslldq $12,%ymm4,%ymm10
+       vpaddd  %ymm4,%ymm4,%ymm4
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm4,%ymm4
+       addl    %r12d,%ecx
+       andl    %edi,%edx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm4,%ymm4
+       addl    68(%r13),%ebx
+       xorl    %eax,%edx
+       vpxor   %ymm10,%ymm4,%ymm4
+       movl    %esi,%edi
+       xorl    %eax,%edi
+       leal    (%rbx,%rdx,1),%ebx
+       vpaddd  %ymm11,%ymm4,%ymm9
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       vmovdqu %ymm9,128(%rsp)
+       addl    %r12d,%ebx
+       andl    %edi,%ecx
+       addl    72(%r13),%ebp
+       xorl    %esi,%ecx
+       movl    %edx,%edi
+       xorl    %esi,%edi
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       andl    %edi,%ebx
+       addl    76(%r13),%eax
+       xorl    %edx,%ebx
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpalignr        $8,%ymm1,%ymm2,%ymm5
+       addl    96(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpsrldq $4,%ymm4,%ymm8
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       vpxor   %ymm1,%ymm5,%ymm5
+       vpxor   %ymm3,%ymm8,%ymm8
+       addl    100(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpxor   %ymm8,%ymm5,%ymm5
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpsrld  $31,%ymm5,%ymm8
+       vmovdqu -32(%r14),%ymm11
+       xorl    %ebx,%esi
+       addl    104(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       vpslldq $12,%ymm5,%ymm10
+       vpaddd  %ymm5,%ymm5,%ymm5
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm5,%ymm5
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm5,%ymm5
+       xorl    %ebp,%edx
+       addl    108(%r13),%ebx
+       leaq    256(%r13),%r13
+       vpxor   %ymm10,%ymm5,%ymm5
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       vpaddd  %ymm11,%ymm5,%ymm9
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vmovdqu %ymm9,160(%rsp)
+       addl    -128(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpalignr        $8,%ymm2,%ymm3,%ymm6
+       addl    -124(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       vpsrldq $4,%ymm5,%ymm8
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       vpxor   %ymm2,%ymm6,%ymm6
+       vpxor   %ymm4,%ymm8,%ymm8
+       addl    -120(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpxor   %ymm8,%ymm6,%ymm6
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       vpsrld  $31,%ymm6,%ymm8
+       xorl    %ecx,%eax
+       addl    -116(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       vpslldq $12,%ymm6,%ymm10
+       vpaddd  %ymm6,%ymm6,%ymm6
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm6,%ymm6
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm6,%ymm6
+       xorl    %ebx,%esi
+       addl    -96(%r13),%ecx
+       vpxor   %ymm10,%ymm6,%ymm6
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       vpaddd  %ymm11,%ymm6,%ymm9
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       vmovdqu %ymm9,192(%rsp)
+       addl    -92(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       vpalignr        $8,%ymm3,%ymm4,%ymm7
+       addl    -88(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       vpsrldq $4,%ymm6,%ymm8
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       vpxor   %ymm3,%ymm7,%ymm7
+       vpxor   %ymm5,%ymm8,%ymm8
+       addl    -84(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       vpxor   %ymm8,%ymm7,%ymm7
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       vpsrld  $31,%ymm7,%ymm8
+       xorl    %edx,%ebp
+       addl    -64(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       vpslldq $12,%ymm7,%ymm10
+       vpaddd  %ymm7,%ymm7,%ymm7
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       vpsrld  $30,%ymm10,%ymm9
+       vpor    %ymm8,%ymm7,%ymm7
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       vpslld  $2,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm7,%ymm7
+       xorl    %ecx,%eax
+       addl    -60(%r13),%edx
+       vpxor   %ymm10,%ymm7,%ymm7
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       rorxl   $2,%esi,%eax
+       vpaddd  %ymm11,%ymm7,%ymm9
+       xorl    %ebp,%esi
+       addl    %r12d,%edx
+       xorl    %ebx,%esi
+       vmovdqu %ymm9,224(%rsp)
+       addl    -56(%r13),%ecx
+       leal    (%rcx,%rsi,1),%ecx
+       rorxl   $27,%edx,%r12d
+       rorxl   $2,%edx,%esi
+       xorl    %eax,%edx
+       addl    %r12d,%ecx
+       xorl    %ebp,%edx
+       addl    -52(%r13),%ebx
+       leal    (%rbx,%rdx,1),%ebx
+       rorxl   $27,%ecx,%r12d
+       rorxl   $2,%ecx,%edx
+       xorl    %esi,%ecx
+       addl    %r12d,%ebx
+       xorl    %eax,%ecx
+       addl    -32(%r13),%ebp
+       leal    (%rcx,%rbp,1),%ebp
+       rorxl   $27,%ebx,%r12d
+       rorxl   $2,%ebx,%ecx
+       xorl    %edx,%ebx
+       addl    %r12d,%ebp
+       xorl    %esi,%ebx
+       addl    -28(%r13),%eax
+       leal    (%rax,%rbx,1),%eax
+       rorxl   $27,%ebp,%r12d
+       rorxl   $2,%ebp,%ebx
+       xorl    %ecx,%ebp
+       addl    %r12d,%eax
+       xorl    %edx,%ebp
+       addl    -24(%r13),%esi
+       leal    (%rsi,%rbp,1),%esi
+       rorxl   $27,%eax,%r12d
+       rorxl   $2,%eax,%ebp
+       xorl    %ebx,%eax
+       addl    %r12d,%esi
+       xorl    %ecx,%eax
+       addl    -20(%r13),%edx
+       leal    (%rdx,%rax,1),%edx
+       rorxl   $27,%esi,%r12d
+       addl    %r12d,%edx
+       leaq    128(%rsp),%r13
+
+
+       addl    0(%r8),%edx
+       addl    4(%r8),%esi
+       addl    8(%r8),%ebp
+       movl    %edx,0(%r8)
+       addl    12(%r8),%ebx
+       movl    %esi,4(%r8)
+       movl    %edx,%eax
+       addl    16(%r8),%ecx
+       movl    %ebp,%r12d
+       movl    %ebp,8(%r8)
+       movl    %ebx,%edx
+
+       movl    %ebx,12(%r8)
+       movl    %esi,%ebp
+       movl    %ecx,16(%r8)
+
+       movl    %ecx,%esi
+       movl    %r12d,%ecx
+
+
+       cmpq    %r10,%r9
+       jbe     L$oop_avx2
+
+L$done_avx2:                   # reached by fall-through once the jbe L$oop_avx2 above is not taken
+       vzeroupper              # zero the upper halves of the ymm registers before leaving AVX code
+       movq    -40(%r11),%r14  # reload %r14 from %r11-40; %r11 is set before this excerpt - NOTE(review): presumably the entry %rsp captured in the prologue, confirm
+
+       movq    -32(%r11),%r13  # reload %r13 from %r11-32
+
+       movq    -24(%r11),%r12  # reload %r12 from %r11-24
+
+       movq    -16(%r11),%rbp  # reload %rbp from %r11-16
+
+       movq    -8(%r11),%rbx   # reload %rbx from %r11-8
+
+       leaq    (%r11),%rsp     # %rsp = %r11, dropping the local frame in one step
+
+L$epilogue_avx2:
+       .byte   0xf3,0xc3       # raw encoding of rep ret (F3 C3)
+
+
 .p2align       6
 K_XX_XX:
-.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     
-.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     
-.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     
-.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # SHA-1 round constant K for rounds 0-19 (FIPS 180-4), one copy per dword
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # second copy: each constant now fills 32 bytes; NOTE(review): presumably so a single 32-byte ymm load sees it in every lane - confirm at the use site
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # SHA-1 round constant K for rounds 20-39
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # repeated to fill 32 bytes
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # SHA-1 round constant K for rounds 40-59
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # repeated to fill 32 bytes
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # SHA-1 round constant K for rounds 60-79
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # repeated to fill 32 bytes
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # in little-endian memory: bytes 03 02 01 00, 07 06 05 04, ...; NOTE(review): looks like a per-dword byte-swap shuffle mask - confirm at the use site
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # repeated to fill 32 bytes
+.byte  0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 # 16 byte indices in descending order 15..0; NOTE(review): looks like a full 16-byte reversal shuffle mask - its user is not visible in this excerpt
 .byte  83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align       6
 
index 300212c915ae6edc8141d335c6456b3b72fdbc5f..8d257109caa348c4905f98b3b962877f4c382291 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha512-586.s"
 .text
 .globl _sha256_block_data_order
 .align 4
@@ -63,20 +62,6 @@ L000pic_point:
        movl    %edi,4(%esp)
        movl    %eax,8(%esp)
        movl    %ebx,12(%esp)
-       movl    L__gnutls_x86_cpuid_s$non_lazy_ptr-L001K256(%ebp),%edx
-       movl    (%edx),%ecx
-       movl    4(%edx),%edx
-       testl   $1048576,%ecx
-       jnz     L002loop
-       testl   $2048,%edx
-       andl    $1073741824,%ecx
-       andl    $268435456,%edx
-       orl     %edx,%ecx
-       cmpl    $1342177280,%ecx
-       je      L003loop_shrd
-       subl    %edi,%eax
-       cmpl    $256,%eax
-       jae     L004unrolled
        jmp     L002loop
 .align 4,0x90
 L002loop:
@@ -148,7 +133,7 @@ L002loop:
        movl    %ecx,28(%esp)
        movl    %edi,32(%esp)
 .align 4,0x90
-L00500_15:
+L00300_15:
        movl    %edx,%ecx
        movl    24(%esp),%esi
        rorl    $14,%ecx
@@ -186,11 +171,11 @@ L00500_15:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3248222580,%esi
-       jne     L00500_15
+       jne     L00300_15
        movl    156(%esp),%ecx
-       jmp     L00616_63
+       jmp     L00416_63
 .align 4,0x90
-L00616_63:
+L00416_63:
        movl    %ecx,%ebx
        movl    104(%esp),%esi
        rorl    $11,%ecx
@@ -245,7 +230,7 @@ L00616_63:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3329325298,%esi
-       jne     L00616_63
+       jne     L00416_63
        movl    356(%esp),%esi
        movl    8(%esp),%ebx
        movl    16(%esp),%ecx
@@ -279,8 +264,8 @@ L00616_63:
        popl    %ebx
        popl    %ebp
        ret
-.align 4,0x90
-L003loop_shrd:
+.align 5,0x90
+L005loop_shrd:
        movl    (%edi),%eax
        movl    4(%edi),%ebx
        movl    8(%edi),%ecx
@@ -349,7 +334,7 @@ L003loop_shrd:
        movl    %ecx,28(%esp)
        movl    %edi,32(%esp)
 .align 4,0x90
-L00700_15_shrd:
+L00600_15_shrd:
        movl    %edx,%ecx
        movl    24(%esp),%esi
        shrdl   $14,%ecx,%ecx
@@ -387,11 +372,11 @@ L00700_15_shrd:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3248222580,%esi
-       jne     L00700_15_shrd
+       jne     L00600_15_shrd
        movl    156(%esp),%ecx
-       jmp     L00816_63_shrd
+       jmp     L00716_63_shrd
 .align 4,0x90
-L00816_63_shrd:
+L00716_63_shrd:
        movl    %ecx,%ebx
        movl    104(%esp),%esi
        shrdl   $11,%ecx,%ecx
@@ -446,7 +431,7 @@ L00816_63_shrd:
        addl    $4,%ebp
        addl    %ebx,%eax
        cmpl    $3329325298,%esi
-       jne     L00816_63_shrd
+       jne     L00716_63_shrd
        movl    356(%esp),%esi
        movl    8(%esp),%ebx
        movl    16(%esp),%ecx
@@ -473,7 +458,7 @@ L00816_63_shrd:
        leal    356(%esp),%esp
        subl    $256,%ebp
        cmpl    8(%esp),%edi
-       jb      L003loop_shrd
+       jb      L005loop_shrd
        movl    12(%esp),%esp
        popl    %edi
        popl    %esi
@@ -484,8 +469,13 @@ L00816_63_shrd:
 L001K256:
 .long  1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
 .long  66051,67438087,134810123,202182159
+.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte  62,0
 .align 4,0x90
-L004unrolled:
+L008unrolled:
        leal    -96(%esp),%esp
        movl    (%esi),%eax
        movl    4(%esi),%ebp
@@ -3391,14 +3381,4 @@ L009grand_loop:
        popl    %ebx
        popl    %ebp
        ret
-.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte  112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte  62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L__gnutls_x86_cpuid_s$non_lazy_ptr:
-.indirect_symbol       __gnutls_x86_cpuid_s
-.long  0
-.comm  __gnutls_x86_cpuid_s,16,2
 
diff --git a/lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s
new file mode 100644 (file)
index 0000000..fd0c247
--- /dev/null
@@ -0,0 +1,5470 @@
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+#     * Redistributions of source code must retain copyright notices,
+#      this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above
+#      copyright notice, this list of conditions and the following
+#      disclaimer in the documentation and/or other materials
+#      provided with the distribution.
+#
+#     * Neither the name of the Andy Polyakov nor the names of its
+#      copyright holder and contributors may be used to endorse or
+#      promote products derived from this software without specific
+#      prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text  
+
+
+.globl _sha256_block_data_order
+
+.p2align       4
+_sha256_block_data_order:      # entry: branch to a specialised variant based on bits in __gnutls_x86_cpuid_s, otherwise set up the frame for the scalar loop below
+
+       leaq    __gnutls_x86_cpuid_s(%rip),%r11 # %r11 = address of the capability words (RIP-relative)
+       movl    0(%r11),%r9d    # %r9d  = capability dword 0
+       movl    4(%r11),%r10d   # %r10d = capability dword 1
+       movl    8(%r11),%r11d   # %r11d = capability dword 2 (the pointer in %r11 is no longer needed)
+       testl   $536870912,%r11d        # 536870912 = 1<<29; NOTE(review): presumably the SHA-extensions feature bit - confirm against the layout of __gnutls_x86_cpuid_s
+       jnz     _shaext_shortcut
+       andl    $296,%r11d      # 296 = 0x128 = bits 3, 5 and 8; NOTE(review): presumably BMI1, AVX2 and BMI2 - confirm
+       cmpl    $296,%r11d      # all three bits must be set
+       je      L$avx2_shortcut
+       andl    $1073741824,%r9d        # keep bit 30 of dword 0
+       andl    $268435968,%r10d        # 268435968 = 0x10000200: keep bits 28 and 9 of dword 1
+       orl     %r9d,%r10d      # merge the three surviving bits into one word
+       cmpl    $1342177792,%r10d       # 1342177792 = 0x50000200: bits 30, 28 and 9 all set
+       je      L$avx_shortcut
+       testl   $512,%r10d      # 512 = 1<<9, the dword 1 bit that survived the mask; NOTE(review): presumably SSSE3 - confirm
+       jnz     L$ssse3_shortcut
+       movq    %rsp,%rax       # remember the entry %rsp; stored at 88(%rsp) below
+
+       pushq   %rbx            # save the registers the scalar rounds overwrite (callee-saved under SysV AMD64)
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx         # %rdx *= 16 ...
+       subq    $64+32,%rsp     # reserve 96 bytes of locals
+       leaq    (%rsi,%rdx,4),%rdx      # ... then %rdx = %rsi + 64 * original %rdx; NOTE(review): end-of-input pointer if %rdx arrives as a count of 64-byte blocks - confirm
+       andq    $-64,%rsp       # round %rsp down to a 64-byte boundary
+       movq    %rdi,64+0(%rsp) # spill first argument at %rsp+64
+       movq    %rsi,64+8(%rsp) # spill second argument at %rsp+72
+       movq    %rdx,64+16(%rsp)        # spill the computed end pointer at %rsp+80
+       movq    %rax,88(%rsp)   # spill the entry %rsp at %rsp+88
+
+L$prologue:                    # frame is in place; load the eight 32-bit words at (%rdi) into working registers
+
+       movl    0(%rdi),%eax    # word 0 -> %eax
+       movl    4(%rdi),%ebx    # word 1 -> %ebx
+       movl    8(%rdi),%ecx    # word 2 -> %ecx
+       movl    12(%rdi),%edx   # word 3 -> %edx (overwrites the end pointer, already spilled at %rsp+80)
+       movl    16(%rdi),%r8d   # word 4 -> %r8d
+       movl    20(%rdi),%r9d   # word 5 -> %r9d
+       movl    24(%rdi),%r10d  # word 6 -> %r10d
+       movl    28(%rdi),%r11d  # word 7 -> %r11d; NOTE(review): presumably the SHA-256 chaining values a..h - confirm against the C prototype
+       jmp     L$loop          # enter the aligned block loop that follows
+
+.p2align       4
+L$loop:
+       movl    %ebx,%edi
+       leaq    K256(%rip),%rbp
+       xorl    %ecx,%edi
+       movl    0(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    4(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    8(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    12(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    16(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    20(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    24(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    28(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%eax
+       movl    32(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    36(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    40(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    44(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    48(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    52(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    56(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    60(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       jmp     L$rounds_16_xx
+.p2align       4
+L$rounds_16_xx:
+       movl    4(%rsp),%r13d
+       movl    56(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    36(%rsp),%r12d
+
+       addl    0(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       movl    8(%rsp),%r13d
+       movl    60(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    40(%rsp),%r12d
+
+       addl    4(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       movl    12(%rsp),%r13d
+       movl    0(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    44(%rsp),%r12d
+
+       addl    8(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       movl    16(%rsp),%r13d
+       movl    4(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    48(%rsp),%r12d
+
+       addl    12(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       movl    20(%rsp),%r13d
+       movl    8(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    52(%rsp),%r12d
+
+       addl    16(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       movl    24(%rsp),%r13d
+       movl    12(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    56(%rsp),%r12d
+
+       addl    20(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       movl    28(%rsp),%r13d
+       movl    16(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    60(%rsp),%r12d
+
+       addl    24(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       movl    32(%rsp),%r13d
+       movl    20(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    0(%rsp),%r12d
+
+       addl    28(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       movl    36(%rsp),%r13d
+       movl    24(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    4(%rsp),%r12d
+
+       addl    32(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+
+       leaq    4(%rbp),%rbp
+       movl    40(%rsp),%r13d
+       movl    28(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    8(%rsp),%r12d
+
+       addl    36(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+
+       leaq    4(%rbp),%rbp
+       movl    44(%rsp),%r13d
+       movl    32(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    12(%rsp),%r12d
+
+       addl    40(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+
+       leaq    4(%rbp),%rbp
+       movl    48(%rsp),%r13d
+       movl    36(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    16(%rsp),%r12d
+
+       addl    44(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+
+       leaq    20(%rbp),%rbp
+       movl    52(%rsp),%r13d
+       movl    40(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    20(%rsp),%r12d
+
+       addl    48(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+
+       leaq    4(%rbp),%rbp
+       movl    56(%rsp),%r13d
+       movl    44(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    24(%rsp),%r12d
+
+       addl    52(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+
+       leaq    4(%rbp),%rbp
+       movl    60(%rsp),%r13d
+       movl    48(%rsp),%r15d
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    28(%rsp),%r12d
+
+       addl    56(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+
+       leaq    4(%rbp),%rbp
+       movl    0(%rsp),%r13d
+       movl    52(%rsp),%edi
+
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    32(%rsp),%r12d
+
+       addl    60(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+
+       leaq    20(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jnz     L$rounds_16_xx
+
+       movq    64+0(%rsp),%rdi
+       addl    %r14d,%eax
+       leaq    64(%rsi),%rsi
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      L$loop
+
+       movq    88(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+L$epilogue:
+       .byte   0xf3,0xc3
+
+
+.p2align       6                # align the table below to a 2^6 = 64-byte boundary
+                                # K256 layout: 16 constant rows (each stored twice), 3 mask rows (each stored twice), then an ASCII tag string
+K256:                           # SHA-256 round constants K[0..63] (values as listed in FIPS 180-4), four 32-bit words per row
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5    # K[0..3]
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5    # duplicate of the previous row; every row in this table appears twice, so distinct rows are 32 bytes apart
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5    # K[4..7]; the SHA-extension code reads rows at 32-byte strides (0-128, 32-128, 64-128 off %rcx = K256+128)
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3    # K[8..11]
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174    # K[12..15]
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc    # K[16..19]
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da    # K[20..23]
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7    # K[24..27]
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967    # K[28..31]
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13    # K[32..35]
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85    # K[36..39]
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3    # K[40..43]
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070    # K[44..47]
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5    # K[48..51]
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3    # K[52..55]
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208    # K[56..59]
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2    # K[60..63]
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2    # end of the 512-byte (32 rows x 16 bytes) constant area
+                                # Mask rows follow. Every K word above has a nonzero top byte, while byte 3 of the next word is 0x00 - presumably what `cmpb $0,3(%rbp)` in the scalar loop tests (NOTE(review): confirm %rbp walks this table)
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f    # K256+512: reverses the bytes of each 32-bit word when used as a pshufb control; loaded via 512-128(%rcx) in sha256_block_data_order_shaext
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff    # shuffle-style mask; its consumers are outside this view (NOTE(review): presumably used by the SIMD message-schedule code - confirm)
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908    # same mask as above with the two 64-bit halves swapped; consumers are outside this view
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0    # NUL-terminated ASCII: "SHA256 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+
+.p2align       6                       # 64-byte alignment for the entry point
+sha256_block_data_order_shaext:        # SHA-256 block function built on the SHA extension opcodes (emitted as raw .byte sequences)
+_shaext_shortcut:                      # in: %rdi = 8-dword hash state, %rsi = input, %rdx = number of 64-byte blocks; presumably also the jump target of a dispatcher outside this excerpt
+       leaq    K256+128(%rip),%rcx     # %rcx = K256 biased by 128; rows are addressed as N-128(%rcx). Clobbers: %rcx,%rdx,%rsi,%xmm0-%xmm10,flags
+       movdqu  (%rdi),%xmm1            # state words a,b,c,d (named in memory order)
+       movdqu  16(%rdi),%xmm2          # state words e,f,g,h
+       movdqa  512-128(%rcx),%xmm7     # K256+512: pshufb mask that byte-swaps each dword
+       # Repack the state into the two-register layout consumed by sha256rnds2: %xmm1 = {a,b,e,f}, %xmm2 = {c,d,g,h}.
+       pshufd  $0x1b,%xmm1,%xmm0       # %xmm0 = dwords of %xmm1 in reverse order
+       pshufd  $0xb1,%xmm1,%xmm1       # swap the two dwords inside each qword
+       pshufd  $0x1b,%xmm2,%xmm2       # reverse dword order of e,f,g,h
+       movdqa  %xmm7,%xmm8             # keep the mask: %xmm7 is reused as scratch inside the loop
+.byte  102,15,58,15,202,8              # palignr $8,%xmm2,%xmm1   -> %xmm1 = a,b,e,f (high to low)
+       punpcklqdq      %xmm0,%xmm2     # %xmm2 = c,d,g,h (high to low)
+       jmp     L$oop_shaext            # jump over the alignment padding below
+       # One iteration of the loop below consumes one 64-byte block; there is no check for %rdx == 0 on entry.
+.p2align       4
+L$oop_shaext:
+       movdqu  (%rsi),%xmm3            # message bytes 0..15  (W[0..3] after the byte swap)
+       movdqu  16(%rsi),%xmm4          # message bytes 16..31 (W[4..7])
+       movdqu  32(%rsi),%xmm5          # message bytes 32..47 (W[8..11])
+.byte  102,15,56,0,223                 # pshufb %xmm7,%xmm3       (byte-swap each dword)
+       movdqu  48(%rsi),%xmm6          # message bytes 48..63 (W[12..15])
+       # 16 groups follow; each does 4 rounds: %xmm0 = W + K, sha256rnds2, move upper W+K pair down, sha256rnds2.
+       movdqa  0-128(%rcx),%xmm0       # rounds 0-3: K[0..3]
+       paddd   %xmm3,%xmm0             # %xmm0 = W[0..3] + K[0..3]
+.byte  102,15,56,0,231                 # pshufb %xmm7,%xmm4
+       movdqa  %xmm2,%xmm10            # save incoming c,d,g,h for the final feed-forward add
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2   (two rounds; %xmm0 is the implicit W+K operand)
+       pshufd  $0x0e,%xmm0,%xmm0       # bring the upper two W+K dwords into the low half
+       nop                             # presumably padding for code alignment/scheduling
+       movdqa  %xmm1,%xmm9             # save incoming a,b,e,f for the final feed-forward add
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1   (next two rounds, register roles swapped)
+
+       movdqa  32-128(%rcx),%xmm0      # rounds 4-7: K[4..7]
+       paddd   %xmm4,%xmm0
+.byte  102,15,56,0,239                 # pshufb %xmm7,%xmm5
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       leaq    64(%rsi),%rsi           # advance the input pointer to the next block
+.byte  15,56,204,220                   # sha256msg1 %xmm4,%xmm3   (begin message schedule for W[16..19])
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  64-128(%rcx),%xmm0      # rounds 8-11: K[8..11]
+       paddd   %xmm5,%xmm0
+.byte  102,15,56,0,247                 # pshufb %xmm7,%xmm6       (last use of the mask; %xmm7 becomes scratch)
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4              # palignr $4,%xmm5,%xmm7   -> %xmm7 = W[9..12]
+       nop
+       paddd   %xmm7,%xmm3             # add the W[t-7] terms into the partial schedule
+.byte  15,56,204,229                   # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  96-128(%rcx),%xmm0      # rounds 12-15: K[12..15]
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                   # sha256msg2 %xmm6,%xmm3   -> %xmm3 = W[16..19]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4              # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                   # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  128-128(%rcx),%xmm0     # rounds 16-19: K[16..19]
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                   # sha256msg2 %xmm3,%xmm4   -> %xmm4 = W[20..23]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4              # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                   # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  160-128(%rcx),%xmm0     # rounds 20-23: K[20..23]
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                   # sha256msg2 %xmm4,%xmm5   -> %xmm5 = W[24..27]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4              # palignr $4,%xmm4,%xmm7
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220                   # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  192-128(%rcx),%xmm0     # rounds 24-27: K[24..27]
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245                   # sha256msg2 %xmm5,%xmm6   -> %xmm6 = W[28..31]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4              # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                   # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  224-128(%rcx),%xmm0     # rounds 28-31: K[28..31]
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                   # sha256msg2 %xmm6,%xmm3   -> %xmm3 = W[32..35]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4              # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                   # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  256-128(%rcx),%xmm0     # rounds 32-35: K[32..35]
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                   # sha256msg2 %xmm3,%xmm4   -> %xmm4 = W[36..39]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4              # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                   # sha256msg1 %xmm3,%xmm6
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  288-128(%rcx),%xmm0     # rounds 36-39: K[36..39]
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                   # sha256msg2 %xmm4,%xmm5   -> %xmm5 = W[40..43]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4              # palignr $4,%xmm4,%xmm7
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220                   # sha256msg1 %xmm4,%xmm3
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  320-128(%rcx),%xmm0     # rounds 40-43: K[40..43]
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245                   # sha256msg2 %xmm5,%xmm6   -> %xmm6 = W[44..47]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4              # palignr $4,%xmm5,%xmm7
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229                   # sha256msg1 %xmm5,%xmm4
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  352-128(%rcx),%xmm0     # rounds 44-47: K[44..47]
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222                   # sha256msg2 %xmm6,%xmm3   -> %xmm3 = W[48..51]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4              # palignr $4,%xmm6,%xmm7
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238                   # sha256msg1 %xmm6,%xmm5
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  384-128(%rcx),%xmm0     # rounds 48-51: K[48..51]
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227                   # sha256msg2 %xmm3,%xmm4   -> %xmm4 = W[52..55]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4              # palignr $4,%xmm3,%xmm7
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243                   # sha256msg1 %xmm3,%xmm6   (last sha256msg1: schedule for W[60..63])
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       movdqa  416-128(%rcx),%xmm0     # rounds 52-55: K[52..55]
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236                   # sha256msg2 %xmm4,%xmm5   -> %xmm5 = W[56..59]
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4              # palignr $4,%xmm4,%xmm7
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       paddd   %xmm7,%xmm6
+
+       movdqa  448-128(%rcx),%xmm0     # rounds 56-59: K[56..59]
+       paddd   %xmm5,%xmm0
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+.byte  15,56,205,245                   # sha256msg2 %xmm5,%xmm6   -> %xmm6 = W[60..63]
+       movdqa  %xmm8,%xmm7             # restore the byte-swap mask for the next block's pshufb
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+
+       movdqa  480-128(%rcx),%xmm0     # rounds 60-63: K[60..63]
+       paddd   %xmm6,%xmm0
+       nop
+.byte  15,56,203,209                   # sha256rnds2 %xmm0,%xmm1,%xmm2
+       pshufd  $0x0e,%xmm0,%xmm0
+       decq    %rdx                    # one block done; ZF from here decides the jnz below (the SSE/SHA ops in between leave flags alone)
+       nop
+.byte  15,56,203,202                   # sha256rnds2 %xmm0,%xmm2,%xmm1
+       # Feed-forward: add the state saved at the top of the iteration.
+       paddd   %xmm10,%xmm2
+       paddd   %xmm9,%xmm1
+       jnz     L$oop_shaext            # loop while blocks remain
+       # Undo the entry repacking so the state goes back to memory as a,b,c,d / e,f,g,h.
+       pshufd  $0xb1,%xmm2,%xmm2       # swap the two dwords inside each qword
+       pshufd  $0x1b,%xmm1,%xmm7       # %xmm7 = dwords of %xmm1 in reverse order
+       pshufd  $0xb1,%xmm1,%xmm1
+       punpckhqdq      %xmm2,%xmm1     # %xmm1 = a,b,c,d in memory order
+.byte  102,15,58,15,215,8              # palignr $8,%xmm7,%xmm2   -> %xmm2 = e,f,g,h in memory order
+
+       movdqu  %xmm1,(%rdi)            # write back state words a..d
+       movdqu  %xmm2,16(%rdi)          # write back state words e..h
+       .byte   0xf3,0xc3               # rep ret
+
+
+.p2align       6
+sha256_block_data_order_ssse3:
+
+L$ssse3_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx
+       subq    $96,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+
+L$prologue_ssse3:
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+
+
+       jmp     L$loop_ssse3
+.p2align       4
+L$loop_ssse3:
+       movdqa  K256+512(%rip),%xmm7
+       movdqu  0(%rsi),%xmm0
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+.byte  102,15,56,0,199
+       movdqu  48(%rsi),%xmm3
+       leaq    K256(%rip),%rbp
+.byte  102,15,56,0,207
+       movdqa  0(%rbp),%xmm4
+       movdqa  32(%rbp),%xmm5
+.byte  102,15,56,0,215
+       paddd   %xmm0,%xmm4
+       movdqa  64(%rbp),%xmm6
+.byte  102,15,56,0,223
+       movdqa  96(%rbp),%xmm7
+       paddd   %xmm1,%xmm5
+       paddd   %xmm2,%xmm6
+       paddd   %xmm3,%xmm7
+       movdqa  %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       movdqa  %xmm5,16(%rsp)
+       movl    %ebx,%edi
+       movdqa  %xmm6,32(%rsp)
+       xorl    %ecx,%edi
+       movdqa  %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     L$ssse3_00_47
+
+.p2align       4
+L$ssse3_00_47:
+       subq    $-128,%rbp
+       rorl    $14,%r13d
+       movdqa  %xmm1,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm3,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,224,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,250,4
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm3,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm0
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm0
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm0,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  0(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm0,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,0(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm2,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm0,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,225,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,251,4
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm0,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm1
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm1
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm1,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  32(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm1,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,16(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm3,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm1,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,226,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,248,4
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm1,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm2
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm2
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm2,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  64(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm2,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,32(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm0,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm2,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,227,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,249,4
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm2,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm3
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm3
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm3,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  96(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm3,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     L$ssse3_00_47
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi
+       movl    %r14d,%eax
+
+       addl    0(%rdi),%eax
+       leaq    64(%rsi),%rsi
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      L$loop_ssse3
+
+       movq    88(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+L$epilogue_ssse3:
+       .byte   0xf3,0xc3
+
+
+
+.p2align       6
+sha256_block_data_order_avx:
+
+L$avx_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx
+       subq    $96,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+
+L$prologue_avx:
+
+       vzeroupper
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%xmm8
+       vmovdqa K256+512+64(%rip),%xmm9
+       jmp     L$loop_avx
+.p2align       4
+L$loop_avx:
+       vmovdqa K256+512(%rip),%xmm7
+       vmovdqu 0(%rsi),%xmm0
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm7,%xmm0,%xmm0
+       leaq    K256(%rip),%rbp
+       vpshufb %xmm7,%xmm1,%xmm1
+       vpshufb %xmm7,%xmm2,%xmm2
+       vpaddd  0(%rbp),%xmm0,%xmm4
+       vpshufb %xmm7,%xmm3,%xmm3
+       vpaddd  32(%rbp),%xmm1,%xmm5
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       vpaddd  96(%rbp),%xmm3,%xmm7
+       vmovdqa %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       vmovdqa %xmm5,16(%rsp)
+       movl    %ebx,%edi
+       vmovdqa %xmm6,32(%rsp)
+       xorl    %ecx,%edi
+       vmovdqa %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     L$avx_00_47
+
+.p2align       4
+L$avx_00_47:
+       subq    $-128,%rbp
+       vpalignr        $4,%xmm0,%xmm1,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm2,%xmm3,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm3,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm0,%xmm0
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       vpshufd $80,%xmm0,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  0(%rbp),%xmm0,%xmm6
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,0(%rsp)
+       vpalignr        $4,%xmm1,%xmm2,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm3,%xmm0,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm0,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm1,%xmm1
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       vpshufd $80,%xmm1,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  32(%rbp),%xmm1,%xmm6
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,16(%rsp)
+       vpalignr        $4,%xmm2,%xmm3,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm0,%xmm1,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm1,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm2,%xmm2
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       vpshufd $80,%xmm2,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,32(%rsp)
+       vpalignr        $4,%xmm3,%xmm0,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm1,%xmm2,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm2,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm3,%xmm3
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       vpshufd $80,%xmm3,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  96(%rbp),%xmm3,%xmm6
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     L$avx_00_47
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi
+       movl    %r14d,%eax
+
+       addl    0(%rdi),%eax
+       leaq    64(%rsi),%rsi
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      L$loop_avx
+
+       movq    88(%rsp),%rsi           # rsi = pointer kept in frame slot 88; presumably the entry stack pointer (the store is outside this chunk)
+
+       vzeroupper                      # clear the upper ymm halves before returning
+       movq    -48(%rsi),%r15          # reload r15, r14, r13, r12, rbp, rbx from the six quadwords just below rsi
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp             # rsp = rsi, releasing the frame
+
+L$epilogue_avx:
+       .byte   0xf3,0xc3               # hand-encoded rep ret (F3 = rep prefix, C3 = ret)
+
+
+
+.p2align       6
+sha256_block_data_order_avx2:   # SHA-256 block routine; the code below uses AVX2 (ymm, vinserti128) plus rorx/andn
+# Inputs, as inferred from how they are used below: rdi -> eight 32-bit state words, rsi -> input bytes, rdx = count of 64-byte blocks.
+L$avx2_shortcut:
+       movq    %rsp,%rax               # keep the entry stack pointer; it is stored at 88(%rsp) once the frame is set up
+# Push the six callee-saved general registers before this routine overwrites them.
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+# Carve out the frame: drop 544 bytes, round down to a 1024-byte boundary, then step back up 448.
+       subq    $544,%rsp
+       shlq    $4,%rdx                 # rdx = blocks * 16 ...
+       andq    $-1024,%rsp
+       leaq    (%rsi,%rdx,4),%rdx      # ... times 4 more: rdx = rsi + blocks * 64 = end of input
+       addq    $448,%rsp               # rsp now sits 448 bytes above a 1024-byte boundary
+       movq    %rdi,64+0(%rsp)         # spill the state pointer
+       movq    %rsi,64+8(%rsp)         # spill the input pointer
+       movq    %rdx,64+16(%rsp)        # spill the end-of-input pointer
+       movq    %rax,88(%rsp)           # spill the entry stack pointer
+
+L$prologue_avx2:
+# Load the eight state words, advance the input pointer and pick the source for the upper ymm lanes.
+       vzeroupper
+       subq    $-64,%rsi               # rsi += 64: the current block is now at -64(%rsi)
+       movl    0(%rdi),%eax            # state words 0..7 go to eax, ebx, ecx, edx, r8d..r11d
+       movq    %rsi,%r12               # r12 = address of the following block
+       movl    4(%rdi),%ebx
+       cmpq    %rdx,%rsi               # does the following block start at the end of input?
+       movl    8(%rdi),%ecx
+       cmoveq  %rsp,%r12               # if so, aim r12 at the stack frame instead (mov leaves flags alone, so the cmp result is still valid)
+       movl    12(%rdi),%edx           # overwrites the end pointer in rdx; it was already spilled to 64+16(%rsp)
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%ymm8 # constant used as the vpshufb control wherever the round code names ymm8
+       vmovdqa K256+512+64(%rip),%ymm9 # likewise for the vpshufb sites that name ymm9
+       jmp     L$oop_avx2              # target is the next label, placed after an alignment directive
+.p2align       4
+L$oop_avx2:
+       vmovdqa K256+512(%rip),%ymm7    # shuffle control for the vpshufb instructions below; presumably a byte-swap mask (table contents are not shown in this chunk)
+       vmovdqu -64+0(%rsi),%xmm0       # low lanes: the 64 bytes of the current block
+       vmovdqu -64+16(%rsi),%xmm1
+       vmovdqu -64+32(%rsi),%xmm2
+       vmovdqu -64+48(%rsi),%xmm3
+# High lanes: 64 bytes from the address in r12 (on first entry: the following block, or the stack frame if the input ends here).
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm7,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm7,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+# Add the first four 32-byte rows of K256 to the shuffled words and park the sums on the stack.
+       leaq    K256(%rip),%rbp         # rbp = start of the K256 table
+       vpshufb %ymm7,%ymm2,%ymm2
+       vpaddd  0(%rbp),%ymm0,%ymm4
+       vpshufb %ymm7,%ymm3,%ymm3
+       vpaddd  32(%rbp),%ymm1,%ymm5
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       vpaddd  96(%rbp),%ymm3,%ymm7    # ymm7 can be reused here: the last shuffle that needed it is above
+       vmovdqa %ymm4,0(%rsp)
+       xorl    %r14d,%r14d             # r14d = 0; the first round at L$avx2_00_47 adds r14 into eax
+       vmovdqa %ymm5,32(%rsp)
+       leaq    -64(%rsp),%rsp          # move the stack pointer down 64 bytes for the next two stores
+       movl    %ebx,%edi               # edi = ebx ^ ecx (together with the xor below); the state pointer was spilled to the frame earlier
+       vmovdqa %ymm6,0(%rsp)
+       xorl    %ecx,%edi
+       vmovdqa %ymm7,32(%rsp)
+       movl    %r9d,%r12d              # r12d = r9d; the lane pointer held in r12 is no longer needed
+       subq    $-32*4,%rbp             # rbp += 128, past the four rows just consumed
+       jmp     L$avx2_00_47
+
+.p2align       4
+L$avx2_00_47:
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm0,%ymm1,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm2,%ymm3,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm0,%ymm0
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm3,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm0,%ymm0
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm0,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  0(%rbp),%ymm0,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm1,%ymm2,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm3,%ymm0,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm1,%ymm1
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm0,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm1,%ymm1
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm1,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  32(%rbp),%ymm1,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm2,%ymm3,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm0,%ymm1,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm2,%ymm2
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm1,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm2,%ymm2
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm2,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm3,%ymm0,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm1,%ymm2,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm3,%ymm3
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm2,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm3,%ymm3
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm3,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  96(%rbp),%ymm3,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    128(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jne     L$avx2_00_47
+       addl    0+64(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+64(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+64(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+64(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+64(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+64(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+64(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+64(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       addl    0(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rbp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       cmpq    80(%rbp),%rsi
+       je      L$done_avx2
+
+       xorl    %r14d,%r14d
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       movl    %r9d,%r12d
+       jmp     L$ower_avx2
+.p2align       4
+L$ower_avx2:
+       addl    0+16(%rbp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+16(%rbp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+16(%rbp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+16(%rbp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+16(%rbp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+16(%rbp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+16(%rbp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+16(%rbp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       leaq    -64(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     L$ower_avx2
+
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rsp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       leaq    128(%rsi),%rsi
+       addl    24(%rdi),%r10d
+       movq    %rsi,%r12
+       addl    28(%rdi),%r11d
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       jbe     L$oop_avx2
+       leaq    (%rsp),%rbp
+
+L$done_avx2:
+       leaq    (%rbp),%rsp
+       movq    88(%rsp),%rsi
+
+       vzeroupper
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+L$epilogue_avx2:
+       .byte   0xf3,0xc3
+
+
+
index 0014a8116b85b89b202e5e7e10db5eea106dd2be..4e60bb45f61e5197762df0b0472ce351283dd83d 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,6 @@
 #
 # *** This file is auto-generated ***
 #
-.file  "sha512-586.s"
 .text
 .globl _sha512_block_data_order
 .align 4
@@ -593,6 +592,8 @@ L001K512:
 .long  4234509866,1501505948
 .long  987167468,1607167915
 .long  1246189591,1816402316
+.long  67438087,66051
+.long  202182159,134810123
 .byte  83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
 .byte  110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
 .byte  67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
index 7e73227c2b84ef1fd6ce1a8a82b38847ac4b7c9f..8bf161601e54a4547564c096e94fd84e1d880c03 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 .text  
 
 
-.globl _sha256_block_data_order
+.globl _sha512_block_data_order
 
 .p2align       4
-_sha256_block_data_order:
+_sha512_block_data_order:
+
        leaq    __gnutls_x86_cpuid_s(%rip),%r11
        movl    0(%r11),%r9d
        movl    4(%r11),%r10d
        movl    8(%r11),%r11d
-       testl   $512,%r10d
-       jnz     L$ssse3_shortcut
+       testl   $2048,%r10d
+       jnz     L$xop_shortcut
+       andl    $296,%r11d
+       cmpl    $296,%r11d
+       je      L$avx2_shortcut
+       andl    $1073741824,%r9d
+       andl    $268435968,%r10d
+       orl     %r9d,%r10d
+       cmpl    $1342177792,%r10d
+       je      L$avx_shortcut
+       movq    %rsp,%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
-       movq    %rsp,%r11
+
        shlq    $4,%rdx
-       subq    $64+32,%rsp
-       leaq    (%rsi,%rdx,4),%rdx
+       subq    $128+32,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
        andq    $-64,%rsp
-       movq    %rdi,64+0(%rsp)
-       movq    %rsi,64+8(%rsp)
-       movq    %rdx,64+16(%rsp)
-       movq    %r11,64+24(%rsp)
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
 L$prologue:
 
-       movl    0(%rdi),%eax
-       movl    4(%rdi),%ebx
-       movl    8(%rdi),%ecx
-       movl    12(%rdi),%edx
-       movl    16(%rdi),%r8d
-       movl    20(%rdi),%r9d
-       movl    24(%rdi),%r10d
-       movl    28(%rdi),%r11d
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
        jmp     L$loop
 
 .p2align       4
 L$loop:
-       movl    %ebx,%edi
-       leaq    K256(%rip),%rbp
-       xorl    %ecx,%edi
-       movl    0(%rsi),%r12d
-       movl    %r8d,%r13d
-       movl    %eax,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,0(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-       movl    4(%rsi),%r12d
-       movl    %edx,%r13d
-       movl    %r11d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,4(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-       movl    8(%rsi),%r12d
-       movl    %ecx,%r13d
-       movl    %r10d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,8(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-       movl    12(%rsi),%r12d
-       movl    %ebx,%r13d
-       movl    %r9d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,12(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-       movl    16(%rsi),%r12d
-       movl    %eax,%r13d
-       movl    %r8d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,16(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-       movl    20(%rsi),%r12d
-       movl    %r11d,%r13d
-       movl    %edx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,20(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-       movl    24(%rsi),%r12d
-       movl    %r10d,%r13d
-       movl    %ecx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,24(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-       movl    28(%rsi),%r12d
-       movl    %r9d,%r13d
-       movl    %ebx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,28(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-       movl    32(%rsi),%r12d
-       movl    %r8d,%r13d
-       movl    %eax,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,32(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-       movl    36(%rsi),%r12d
-       movl    %edx,%r13d
-       movl    %r11d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,36(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-       movl    40(%rsi),%r12d
-       movl    %ecx,%r13d
-       movl    %r10d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,40(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-       movl    44(%rsi),%r12d
-       movl    %ebx,%r13d
-       movl    %r9d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,44(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-       movl    48(%rsi),%r12d
-       movl    %eax,%r13d
-       movl    %r8d,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,48(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-       movl    52(%rsi),%r12d
-       movl    %r11d,%r13d
-       movl    %edx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,52(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-       movl    56(%rsi),%r12d
-       movl    %r10d,%r13d
-       movl    %ecx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,56(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-       movl    60(%rsi),%r12d
-       movl    %r9d,%r13d
-       movl    %ebx,%r14d
-       bswapl  %r12d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,60(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    4(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
+       movq    %rbx,%rdi
+       leaq    K512(%rip),%rbp
+       xorq    %rcx,%rdi
+       movq    0(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    8(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    16(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    24(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    32(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    40(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    48(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    56(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rax
+       movq    64(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    72(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    80(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    88(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    96(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    104(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    112(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    120(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
        jmp     L$rounds_16_xx
 .p2align       4
 L$rounds_16_xx:
-
-       movl    56(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    36(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    0(%rsp),%r12d
-       movl    %r8d,%r13d
-       addl    %r14d,%r12d
-       movl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,0(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       movl    8(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-
-       movl    60(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    40(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    4(%rsp),%r12d
-       movl    %edx,%r13d
-       addl    %r14d,%r12d
-       movl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,4(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       movl    12(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-
-       movl    0(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    44(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    8(%rsp),%r12d
-       movl    %ecx,%r13d
-       addl    %r14d,%r12d
-       movl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,8(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       movl    16(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-
-       movl    4(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    48(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    12(%rsp),%r12d
-       movl    %ebx,%r13d
-       addl    %r14d,%r12d
-       movl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,12(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       movl    20(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-
-       movl    8(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    52(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    16(%rsp),%r12d
-       movl    %eax,%r13d
-       addl    %r14d,%r12d
-       movl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,16(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       movl    24(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-
-       movl    12(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    56(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    20(%rsp),%r12d
-       movl    %r11d,%r13d
-       addl    %r14d,%r12d
-       movl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,20(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       movl    28(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-
-       movl    16(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    60(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    24(%rsp),%r12d
-       movl    %r10d,%r13d
-       addl    %r14d,%r12d
-       movl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,24(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       movl    32(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-
-       movl    20(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    0(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    28(%rsp),%r12d
-       movl    %r9d,%r13d
-       addl    %r14d,%r12d
-       movl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,28(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    36(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-
-       movl    24(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    4(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    32(%rsp),%r12d
-       movl    %r8d,%r13d
-       addl    %r14d,%r12d
-       movl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r9d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r8d,%r13d
-       xorl    %r10d,%r15d
-
-       movl    %r12d,32(%rsp)
-       xorl    %eax,%r14d
-       andl    %r8d,%r15d
-
-       rorl    $5,%r13d
-       addl    %r11d,%r12d
-       xorl    %r10d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r8d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %eax,%r15d
-       addl    (%rbp),%r12d
-       xorl    %eax,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ebx,%r15d
-       movl    %ebx,%r11d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r11d
-       addl    %r12d,%edx
-       addl    %r12d,%r11d
-       movl    40(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r11d
-
-
-       movl    28(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    8(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    36(%rsp),%r12d
-       movl    %edx,%r13d
-       addl    %r14d,%r12d
-       movl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r8d,%edi
-
-       rorl    $9,%r14d
-       xorl    %edx,%r13d
-       xorl    %r9d,%edi
-
-       movl    %r12d,36(%rsp)
-       xorl    %r11d,%r14d
-       andl    %edx,%edi
-
-       rorl    $5,%r13d
-       addl    %r10d,%r12d
-       xorl    %r9d,%edi
-
-       rorl    $11,%r14d
-       xorl    %edx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r11d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r11d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %eax,%edi
-       movl    %eax,%r10d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r10d
-       addl    %r12d,%ecx
-       addl    %r12d,%r10d
-       movl    44(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r10d
-
-
-       movl    32(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    12(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    40(%rsp),%r12d
-       movl    %ecx,%r13d
-       addl    %r14d,%r12d
-       movl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %edx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %ecx,%r13d
-       xorl    %r8d,%r15d
-
-       movl    %r12d,40(%rsp)
-       xorl    %r10d,%r14d
-       andl    %ecx,%r15d
-
-       rorl    $5,%r13d
-       addl    %r9d,%r12d
-       xorl    %r8d,%r15d
-
-       rorl    $11,%r14d
-       xorl    %ecx,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r10d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r10d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r11d,%r15d
-       movl    %r11d,%r9d
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%r9d
-       addl    %r12d,%ebx
-       addl    %r12d,%r9d
-       movl    48(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%r9d
-
-
-       movl    36(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    16(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    44(%rsp),%r12d
-       movl    %ebx,%r13d
-       addl    %r14d,%r12d
-       movl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %ecx,%edi
-
-       rorl    $9,%r14d
-       xorl    %ebx,%r13d
-       xorl    %edx,%edi
-
-       movl    %r12d,44(%rsp)
-       xorl    %r9d,%r14d
-       andl    %ebx,%edi
-
-       rorl    $5,%r13d
-       addl    %r8d,%r12d
-       xorl    %edx,%edi
-
-       rorl    $11,%r14d
-       xorl    %ebx,%r13d
-       addl    %edi,%r12d
-
-       movl    %r9d,%edi
-       addl    (%rbp),%r12d
-       xorl    %r9d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r10d,%edi
-       movl    %r10d,%r8d
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%r8d
-       addl    %r12d,%eax
-       addl    %r12d,%r8d
-       movl    52(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%r8d
-
-
-       movl    40(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    20(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    48(%rsp),%r12d
-       movl    %eax,%r13d
-       addl    %r14d,%r12d
-       movl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %ebx,%r15d
-
-       rorl    $9,%r14d
-       xorl    %eax,%r13d
-       xorl    %ecx,%r15d
-
-       movl    %r12d,48(%rsp)
-       xorl    %r8d,%r14d
-       andl    %eax,%r15d
-
-       rorl    $5,%r13d
-       addl    %edx,%r12d
-       xorl    %ecx,%r15d
-
-       rorl    $11,%r14d
-       xorl    %eax,%r13d
-       addl    %r15d,%r12d
-
-       movl    %r8d,%r15d
-       addl    (%rbp),%r12d
-       xorl    %r8d,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r9d,%r15d
-       movl    %r9d,%edx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%edx
-       addl    %r12d,%r11d
-       addl    %r12d,%edx
-       movl    56(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%edx
-
-
-       movl    44(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    24(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    52(%rsp),%r12d
-       movl    %r11d,%r13d
-       addl    %r14d,%r12d
-       movl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %eax,%edi
-
-       rorl    $9,%r14d
-       xorl    %r11d,%r13d
-       xorl    %ebx,%edi
-
-       movl    %r12d,52(%rsp)
-       xorl    %edx,%r14d
-       andl    %r11d,%edi
-
-       rorl    $5,%r13d
-       addl    %ecx,%r12d
-       xorl    %ebx,%edi
-
-       rorl    $11,%r14d
-       xorl    %r11d,%r13d
-       addl    %edi,%r12d
-
-       movl    %edx,%edi
-       addl    (%rbp),%r12d
-       xorl    %edx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %r8d,%edi
-       movl    %r8d,%ecx
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%ecx
-       addl    %r12d,%r10d
-       addl    %r12d,%ecx
-       movl    60(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ecx
-
-
-       movl    48(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%r15d
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %r15d,%r14d
-       shrl    $10,%r15d
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    28(%rsp),%r12d
-       xorl    %r15d,%r14d
-
-       addl    56(%rsp),%r12d
-       movl    %r10d,%r13d
-       addl    %r14d,%r12d
-       movl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r11d,%r15d
-
-       rorl    $9,%r14d
-       xorl    %r10d,%r13d
-       xorl    %eax,%r15d
-
-       movl    %r12d,56(%rsp)
-       xorl    %ecx,%r14d
-       andl    %r10d,%r15d
-
-       rorl    $5,%r13d
-       addl    %ebx,%r12d
-       xorl    %eax,%r15d
-
-       rorl    $11,%r14d
-       xorl    %r10d,%r13d
-       addl    %r15d,%r12d
-
-       movl    %ecx,%r15d
-       addl    (%rbp),%r12d
-       xorl    %ecx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %edx,%r15d
-       movl    %edx,%ebx
-
-       rorl    $2,%r14d
-       andl    %r15d,%edi
-       addl    %r13d,%r12d
-
-       xorl    %edi,%ebx
-       addl    %r12d,%r9d
-       addl    %r12d,%ebx
-       movl    0(%rsp),%r13d
-       leaq    4(%rbp),%rbp
-       addl    %r14d,%ebx
-
-
-       movl    52(%rsp),%r14d
-
-       movl    %r13d,%r12d
-       rorl    $11,%r13d
-       movl    %r14d,%edi
-       rorl    $2,%r14d
-
-       xorl    %r12d,%r13d
-       shrl    $3,%r12d
-       rorl    $7,%r13d
-       xorl    %edi,%r14d
-       shrl    $10,%edi
-
-       xorl    %r13d,%r12d
-       rorl    $17,%r14d
-       addl    32(%rsp),%r12d
-       xorl    %edi,%r14d
-
-       addl    60(%rsp),%r12d
-       movl    %r9d,%r13d
-       addl    %r14d,%r12d
-       movl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r10d,%edi
-
-       rorl    $9,%r14d
-       xorl    %r9d,%r13d
-       xorl    %r11d,%edi
-
-       movl    %r12d,60(%rsp)
-       xorl    %ebx,%r14d
-       andl    %r9d,%edi
-
-       rorl    $5,%r13d
-       addl    %eax,%r12d
-       xorl    %r11d,%edi
-
-       rorl    $11,%r14d
-       xorl    %r9d,%r13d
-       addl    %edi,%r12d
-
-       movl    %ebx,%edi
-       addl    (%rbp),%r12d
-       xorl    %ebx,%r14d
-
-       rorl    $6,%r13d
-       xorl    %ecx,%edi
-       movl    %ecx,%eax
-
-       rorl    $2,%r14d
-       andl    %edi,%r15d
-       addl    %r13d,%r12d
-
-       xorl    %r15d,%eax
-       addl    %r12d,%r8d
-       addl    %r12d,%eax
-       movl    4(%rsp),%r13d
-       leaq    20(%rbp),%rbp
-       addl    %r14d,%eax
-
-       cmpb    $0,3(%rbp)
+       movq    8(%rsp),%r13
+       movq    112(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    72(%rsp),%r12
+
+       addq    0(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       movq    16(%rsp),%r13
+       movq    120(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    80(%rsp),%r12
+
+       addq    8(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       movq    24(%rsp),%r13
+       movq    0(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    88(%rsp),%r12
+
+       addq    16(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       movq    32(%rsp),%r13
+       movq    8(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    96(%rsp),%r12
+
+       addq    24(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       movq    40(%rsp),%r13
+       movq    16(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    104(%rsp),%r12
+
+       addq    32(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       movq    48(%rsp),%r13
+       movq    24(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    112(%rsp),%r12
+
+       addq    40(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       movq    56(%rsp),%r13
+       movq    32(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    120(%rsp),%r12
+
+       addq    48(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       movq    64(%rsp),%r13
+       movq    40(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    0(%rsp),%r12
+
+       addq    56(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       movq    72(%rsp),%r13
+       movq    48(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    8(%rsp),%r12
+
+       addq    64(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+
+       leaq    8(%rbp),%rbp
+       movq    80(%rsp),%r13
+       movq    56(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    16(%rsp),%r12
+
+       addq    72(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+
+       leaq    24(%rbp),%rbp
+       movq    88(%rsp),%r13
+       movq    64(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    24(%rsp),%r12
+
+       addq    80(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+
+       leaq    8(%rbp),%rbp
+       movq    96(%rsp),%r13
+       movq    72(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    32(%rsp),%r12
+
+       addq    88(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+
+       leaq    24(%rbp),%rbp
+       movq    104(%rsp),%r13
+       movq    80(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    40(%rsp),%r12
+
+       addq    96(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+
+       leaq    8(%rbp),%rbp
+       movq    112(%rsp),%r13
+       movq    88(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    48(%rsp),%r12
+
+       addq    104(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+
+       leaq    24(%rbp),%rbp
+       movq    120(%rsp),%r13
+       movq    96(%rsp),%r15
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    56(%rsp),%r12
+
+       addq    112(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+
+       leaq    8(%rbp),%rbp
+       movq    0(%rsp),%r13
+       movq    104(%rsp),%rdi
+
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    64(%rsp),%r12
+
+       addq    120(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+
+       leaq    24(%rbp),%rbp
+       cmpb    $0,7(%rbp)
        jnz     L$rounds_16_xx
 
-       movq    64+0(%rsp),%rdi
-       leaq    64(%rsi),%rsi
-
-       addl    0(%rdi),%eax
-       addl    4(%rdi),%ebx
-       addl    8(%rdi),%ecx
-       addl    12(%rdi),%edx
-       addl    16(%rdi),%r8d
-       addl    20(%rdi),%r9d
-       addl    24(%rdi),%r10d
-       addl    28(%rdi),%r11d
-
-       cmpq    64+16(%rsp),%rsi
-
-       movl    %eax,0(%rdi)
-       movl    %ebx,4(%rdi)
-       movl    %ecx,8(%rdi)
-       movl    %edx,12(%rdi)
-       movl    %r8d,16(%rdi)
-       movl    %r9d,20(%rdi)
-       movl    %r10d,24(%rdi)
-       movl    %r11d,28(%rdi)
+       movq    128+0(%rsp),%rdi
+       addq    %r14,%rax
+       leaq    128(%rsi),%rsi
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
        jb      L$loop
 
-       movq    64+24(%rsp),%rsi
-       movq    (%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
+       movq    152(%rsp),%rsi
+
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
 L$epilogue:
        .byte   0xf3,0xc3
 
+
 .p2align       6
 
-K256:
-.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.byte  83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+K512:
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.byte  83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 
 .p2align       6
-sha256_block_data_order_ssse3:
-L$ssse3_shortcut:
+sha512_block_data_order_xop:
+
+L$xop_shortcut:
+       movq    %rsp,%rax
+
        pushq   %rbx
+
        pushq   %rbp
+
        pushq   %r12
+
        pushq   %r13
+
        pushq   %r14
+
        pushq   %r15
-       movq    %rsp,%r11
+
        shlq    $4,%rdx
-       subq    $96,%rsp
-       leaq    (%rsi,%rdx,4),%rdx
+       subq    $160,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
        andq    $-64,%rsp
-       movq    %rdi,64+0(%rsp)
-       movq    %rsi,64+8(%rsp)
-       movq    %rdx,64+16(%rsp)
-       movq    %r11,64+24(%rsp)
-L$prologue_ssse3:
-
-       movl    0(%rdi),%eax
-       movl    4(%rdi),%ebx
-       movl    8(%rdi),%ecx
-       movl    12(%rdi),%edx
-       movl    16(%rdi),%r8d
-       movl    20(%rdi),%r9d
-       movl    24(%rdi),%r10d
-       movl    28(%rdi),%r11d
-       movdqa  K256+512+32(%rip),%xmm8
-       movdqa  K256+512+64(%rip),%xmm9
-       jmp     L$loop_ssse3
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
+L$prologue_xop:
+
+       vzeroupper
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     L$loop_xop
 .p2align       4
-L$loop_ssse3:
-       movdqa  K256+512(%rip),%xmm7
-       movdqu  0(%rsi),%xmm0
-       movdqu  16(%rsi),%xmm1
-       movdqu  32(%rsi),%xmm2
-       movdqu  48(%rsi),%xmm3
-.byte  102,15,56,0,199
-       leaq    K256(%rip),%rbp
-.byte  102,15,56,0,207
-       movdqa  0(%rbp),%xmm4
-.byte  102,15,56,0,215
-       movdqa  32(%rbp),%xmm5
-       paddd   %xmm0,%xmm4
-       movdqa  64(%rbp),%xmm6
-.byte  102,15,56,0,223
-       movdqa  96(%rbp),%xmm7
-       paddd   %xmm1,%xmm5
-       paddd   %xmm2,%xmm6
-       paddd   %xmm3,%xmm7
-       movdqa  %xmm4,0(%rsp)
-       movl    %eax,%r14d
-       movdqa  %xmm5,16(%rsp)
-       movl    %ebx,%edi
-       movdqa  %xmm6,32(%rsp)
-       xorl    %ecx,%edi
-       movdqa  %xmm7,48(%rsp)
-       movl    %r8d,%r13d
-       jmp     L$ssse3_00_47
+L$loop_xop:
+       vmovdqa K512+1280(%rip),%xmm11
+       vmovdqu 0(%rsi),%xmm0
+       leaq    K512+128(%rip),%rbp
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9
+       vpshufb %xmm11,%xmm7,%xmm7
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11
+       vmovdqa %xmm8,0(%rsp)
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %xmm11,112(%rsp)
+       movq    %r8,%r13
+       jmp     L$xop_00_47
 
 .p2align       4
-L$ssse3_00_47:
-       subq    $-32*4,%rbp
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movdqa  %xmm1,%xmm4
-       movl    %r9d,%r12d
-       movdqa  %xmm3,%xmm7
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-.byte  102,15,58,15,224,4
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-.byte  102,15,58,15,250,4
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    0(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %ebx,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       paddd   %xmm7,%xmm0
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       psrld   $7,%xmm6
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       pshufd  $250,%xmm3,%xmm7
-       movl    %r8d,%r12d
-       pslld   $14,%xmm5
-       xorl    %edx,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       pslld   $11,%xmm5
-       addl    4(%rsp),%r10d
-       pxor    %xmm6,%xmm4
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       paddd   %xmm4,%xmm0
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    8(%rsp),%r9d
-.byte  102,65,15,56,0,248
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm0
-       addl    %r12d,%r9d
-       pshufd  $80,%xmm0,%xmm7
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       psrld   $10,%xmm7
-       movl    %ebx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       pxor    %xmm6,%xmm7
-       addl    12(%rsp),%r8d
-       movl    %r9d,%edi
-       movdqa  0(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       paddd   %xmm7,%xmm0
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       paddd   %xmm0,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       movdqa  %xmm6,0(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movdqa  %xmm2,%xmm4
-       movl    %ebx,%r12d
-       movdqa  %xmm0,%xmm7
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-.byte  102,15,58,15,225,4
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-.byte  102,15,58,15,251,4
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    16(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %r9d,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       paddd   %xmm7,%xmm1
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       psrld   $7,%xmm6
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       pshufd  $250,%xmm0,%xmm7
-       movl    %eax,%r12d
-       pslld   $14,%xmm5
-       xorl    %r11d,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       pslld   $11,%xmm5
-       addl    20(%rsp),%ecx
-       pxor    %xmm6,%xmm4
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       paddd   %xmm4,%xmm1
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    24(%rsp),%ebx
-.byte  102,65,15,56,0,248
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm1
-       addl    %r12d,%ebx
-       pshufd  $80,%xmm1,%xmm7
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       psrld   $10,%xmm7
-       movl    %r9d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       pxor    %xmm6,%xmm7
-       addl    28(%rsp),%eax
-       movl    %ebx,%edi
-       movdqa  32(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       paddd   %xmm7,%xmm1
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       paddd   %xmm1,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movdqa  %xmm6,16(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movdqa  %xmm3,%xmm4
-       movl    %r9d,%r12d
-       movdqa  %xmm1,%xmm7
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-.byte  102,15,58,15,226,4
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-.byte  102,15,58,15,248,4
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    32(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %ebx,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       paddd   %xmm7,%xmm2
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       psrld   $7,%xmm6
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       pshufd  $250,%xmm1,%xmm7
-       movl    %r8d,%r12d
-       pslld   $14,%xmm5
-       xorl    %edx,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       pslld   $11,%xmm5
-       addl    36(%rsp),%r10d
-       pxor    %xmm6,%xmm4
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       paddd   %xmm4,%xmm2
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    40(%rsp),%r9d
-.byte  102,65,15,56,0,248
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm2
-       addl    %r12d,%r9d
-       pshufd  $80,%xmm2,%xmm7
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       psrld   $10,%xmm7
-       movl    %ebx,%r13d
-       psrlq   $17,%xmm6
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       pxor    %xmm6,%xmm7
-       addl    44(%rsp),%r8d
-       movl    %r9d,%edi
-       movdqa  64(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       paddd   %xmm7,%xmm2
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       paddd   %xmm2,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       movdqa  %xmm6,32(%rsp)
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movdqa  %xmm0,%xmm4
-       movl    %ebx,%r12d
-       movdqa  %xmm2,%xmm7
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-.byte  102,15,58,15,227,4
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-.byte  102,15,58,15,249,4
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    48(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       movdqa  %xmm4,%xmm5
-       xorl    %r9d,%r15d
-       movdqa  %xmm4,%xmm6
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       psrld   $3,%xmm4
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       paddd   %xmm7,%xmm3
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       psrld   $7,%xmm6
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       pshufd  $250,%xmm2,%xmm7
-       movl    %eax,%r12d
-       pslld   $14,%xmm5
-       xorl    %r11d,%r13d
-       pxor    %xmm6,%xmm4
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       psrld   $11,%xmm6
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       pxor    %xmm5,%xmm4
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       pslld   $11,%xmm5
-       addl    52(%rsp),%ecx
-       pxor    %xmm6,%xmm4
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       movdqa  %xmm7,%xmm6
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       pxor    %xmm5,%xmm4
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       psrld   $10,%xmm7
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       paddd   %xmm4,%xmm3
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       psrlq   $2,%xmm6
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       pxor    %xmm6,%xmm7
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    56(%rsp),%ebx
-.byte  102,65,15,56,0,248
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       paddd   %xmm7,%xmm3
-       addl    %r12d,%ebx
-       pshufd  $80,%xmm3,%xmm7
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       movdqa  %xmm7,%xmm6
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       psrld   $10,%xmm7
-       movl    %r9d,%r13d
-       psrlq   $17,%xmm6
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       pxor    %xmm6,%xmm7
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       psrlq   $2,%xmm6
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       pxor    %xmm6,%xmm7
-       addl    60(%rsp),%eax
-       movl    %ebx,%edi
-       movdqa  96(%rbp),%xmm6
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-.byte  102,65,15,56,0,249
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       paddd   %xmm7,%xmm3
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       paddd   %xmm3,%xmm6
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movdqa  %xmm6,48(%rsp)
-       cmpb    $0,131(%rbp)
-       jne     L$ssse3_00_47
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movl    %r9d,%r12d
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    0(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       xorl    %ebx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       movl    %r8d,%r12d
-       xorl    %edx,%r13d
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       addl    4(%rsp),%r10d
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    8(%rsp),%r9d
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r9d
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       movl    %ebx,%r13d
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       addl    12(%rsp),%r8d
-       movl    %r9d,%edi
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movl    %ebx,%r12d
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    16(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       xorl    %r9d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       movl    %eax,%r12d
-       xorl    %r11d,%r13d
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       addl    20(%rsp),%ecx
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    24(%rsp),%ebx
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%ebx
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       movl    %r9d,%r13d
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       addl    28(%rsp),%eax
-       movl    %ebx,%edi
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%eax
-       movl    %r9d,%r12d
-       xorl    %r8d,%r13d
-       rorl    $9,%r14d
-       xorl    %r10d,%r12d
-       rorl    $5,%r13d
-       xorl    %eax,%r14d
-       andl    %r8d,%r12d
-       xorl    %r8d,%r13d
-       addl    32(%rsp),%r11d
-       movl    %eax,%r15d
-       rorl    $11,%r14d
-       xorl    %r10d,%r12d
-       xorl    %ebx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r11d
-       andl    %r15d,%edi
-       xorl    %eax,%r14d
-       addl    %r13d,%r11d
-       xorl    %ebx,%edi
-       addl    %r11d,%edx
-       rorl    $2,%r14d
-       addl    %edi,%r11d
-       movl    %edx,%r13d
-       addl    %r11d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r11d
-       movl    %r8d,%r12d
-       xorl    %edx,%r13d
-       rorl    $9,%r14d
-       xorl    %r9d,%r12d
-       rorl    $5,%r13d
-       xorl    %r11d,%r14d
-       andl    %edx,%r12d
-       xorl    %edx,%r13d
-       addl    36(%rsp),%r10d
-       movl    %r11d,%edi
-       rorl    $11,%r14d
-       xorl    %r9d,%r12d
-       xorl    %eax,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r10d
-       andl    %edi,%r15d
-       xorl    %r11d,%r14d
-       addl    %r13d,%r10d
-       xorl    %eax,%r15d
-       addl    %r10d,%ecx
-       rorl    $2,%r14d
-       addl    %r15d,%r10d
-       movl    %ecx,%r13d
-       addl    %r10d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r10d
-       movl    %edx,%r12d
-       xorl    %ecx,%r13d
-       rorl    $9,%r14d
-       xorl    %r8d,%r12d
-       rorl    $5,%r13d
-       xorl    %r10d,%r14d
-       andl    %ecx,%r12d
-       xorl    %ecx,%r13d
-       addl    40(%rsp),%r9d
-       movl    %r10d,%r15d
-       rorl    $11,%r14d
-       xorl    %r8d,%r12d
-       xorl    %r11d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%r9d
-       andl    %r15d,%edi
-       xorl    %r10d,%r14d
-       addl    %r13d,%r9d
-       xorl    %r11d,%edi
-       addl    %r9d,%ebx
-       rorl    $2,%r14d
-       addl    %edi,%r9d
-       movl    %ebx,%r13d
-       addl    %r9d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r9d
-       movl    %ecx,%r12d
-       xorl    %ebx,%r13d
-       rorl    $9,%r14d
-       xorl    %edx,%r12d
-       rorl    $5,%r13d
-       xorl    %r9d,%r14d
-       andl    %ebx,%r12d
-       xorl    %ebx,%r13d
-       addl    44(%rsp),%r8d
-       movl    %r9d,%edi
-       rorl    $11,%r14d
-       xorl    %edx,%r12d
-       xorl    %r10d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%r8d
-       andl    %edi,%r15d
-       xorl    %r9d,%r14d
-       addl    %r13d,%r8d
-       xorl    %r10d,%r15d
-       addl    %r8d,%eax
-       rorl    $2,%r14d
-       addl    %r15d,%r8d
-       movl    %eax,%r13d
-       addl    %r8d,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%r8d
-       movl    %ebx,%r12d
-       xorl    %eax,%r13d
-       rorl    $9,%r14d
-       xorl    %ecx,%r12d
-       rorl    $5,%r13d
-       xorl    %r8d,%r14d
-       andl    %eax,%r12d
-       xorl    %eax,%r13d
-       addl    48(%rsp),%edx
-       movl    %r8d,%r15d
-       rorl    $11,%r14d
-       xorl    %ecx,%r12d
-       xorl    %r9d,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%edx
-       andl    %r15d,%edi
-       xorl    %r8d,%r14d
-       addl    %r13d,%edx
-       xorl    %r9d,%edi
-       addl    %edx,%r11d
-       rorl    $2,%r14d
-       addl    %edi,%edx
-       movl    %r11d,%r13d
-       addl    %edx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%edx
-       movl    %eax,%r12d
-       xorl    %r11d,%r13d
-       rorl    $9,%r14d
-       xorl    %ebx,%r12d
-       rorl    $5,%r13d
-       xorl    %edx,%r14d
-       andl    %r11d,%r12d
-       xorl    %r11d,%r13d
-       addl    52(%rsp),%ecx
-       movl    %edx,%edi
-       rorl    $11,%r14d
-       xorl    %ebx,%r12d
-       xorl    %r8d,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%ecx
-       andl    %edi,%r15d
-       xorl    %edx,%r14d
-       addl    %r13d,%ecx
-       xorl    %r8d,%r15d
-       addl    %ecx,%r10d
-       rorl    $2,%r14d
-       addl    %r15d,%ecx
-       movl    %r10d,%r13d
-       addl    %ecx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ecx
-       movl    %r11d,%r12d
-       xorl    %r10d,%r13d
-       rorl    $9,%r14d
-       xorl    %eax,%r12d
-       rorl    $5,%r13d
-       xorl    %ecx,%r14d
-       andl    %r10d,%r12d
-       xorl    %r10d,%r13d
-       addl    56(%rsp),%ebx
-       movl    %ecx,%r15d
-       rorl    $11,%r14d
-       xorl    %eax,%r12d
-       xorl    %edx,%r15d
-       rorl    $6,%r13d
-       addl    %r12d,%ebx
-       andl    %r15d,%edi
-       xorl    %ecx,%r14d
-       addl    %r13d,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%r9d
-       rorl    $2,%r14d
-       addl    %edi,%ebx
-       movl    %r9d,%r13d
-       addl    %ebx,%r14d
-       rorl    $14,%r13d
-       movl    %r14d,%ebx
-       movl    %r10d,%r12d
-       xorl    %r9d,%r13d
-       rorl    $9,%r14d
-       xorl    %r11d,%r12d
-       rorl    $5,%r13d
-       xorl    %ebx,%r14d
-       andl    %r9d,%r12d
-       xorl    %r9d,%r13d
-       addl    60(%rsp),%eax
-       movl    %ebx,%edi
-       rorl    $11,%r14d
-       xorl    %r11d,%r12d
-       xorl    %ecx,%edi
-       rorl    $6,%r13d
-       addl    %r12d,%eax
-       andl    %edi,%r15d
-       xorl    %ebx,%r14d
-       addl    %r13d,%eax
-       xorl    %ecx,%r15d
-       addl    %eax,%r8d
-       rorl    $2,%r14d
-       addl    %r15d,%eax
-       movl    %r8d,%r13d
-       addl    %eax,%r14d
-       movq    64+0(%rsp),%rdi
-       movl    %r14d,%eax
-
-       addl    0(%rdi),%eax
-       leaq    64(%rsi),%rsi
-       addl    4(%rdi),%ebx
-       addl    8(%rdi),%ecx
-       addl    12(%rdi),%edx
-       addl    16(%rdi),%r8d
-       addl    20(%rdi),%r9d
-       addl    24(%rdi),%r10d
-       addl    28(%rdi),%r11d
-
-       cmpq    64+16(%rsp),%rsi
-
-       movl    %eax,0(%rdi)
-       movl    %ebx,4(%rdi)
-       movl    %ecx,8(%rdi)
-       movl    %edx,12(%rdi)
-       movl    %r8d,16(%rdi)
-       movl    %r9d,20(%rdi)
-       movl    %r10d,24(%rdi)
-       movl    %r11d,28(%rdi)
-       jb      L$loop_ssse3
-
-       movq    64+24(%rsp),%rsi
-       movq    (%rsi),%r15
-       movq    8(%rsi),%r14
-       movq    16(%rsi),%r13
-       movq    24(%rsi),%r12
-       movq    32(%rsi),%rbp
-       movq    40(%rsi),%rbx
-       leaq    48(%rsi),%rsp
-L$epilogue_ssse3:
+L$xop_00_47:
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       vpaddq  %xmm11,%xmm0,%xmm0
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,223,3
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm7,%xmm10
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %rdx,%r13
+       addq    %r11,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r11
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpaddq  %xmm11,%xmm0,%xmm0
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       vpaddq  %xmm11,%xmm1,%xmm1
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,216,3
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm0,%xmm10
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rbx,%r13
+       addq    %r9,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r9
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpaddq  %xmm11,%xmm1,%xmm1
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       vpaddq  %xmm11,%xmm2,%xmm2
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,217,3
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm1,%xmm10
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %r11,%r13
+       addq    %rdx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpaddq  %xmm11,%xmm2,%xmm2
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       vpaddq  %xmm11,%xmm3,%xmm3
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,218,3
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm2,%xmm10
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r9,%r13
+       addq    %rbx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpaddq  %xmm11,%xmm3,%xmm3
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       vpaddq  %xmm11,%xmm4,%xmm4
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,219,3
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm3,%xmm10
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %rdx,%r13
+       addq    %r11,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r11
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpaddq  %xmm11,%xmm4,%xmm4
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       vpaddq  %xmm11,%xmm5,%xmm5
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+.byte  143,72,120,195,209,7
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,220,3
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm4,%xmm10
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rbx,%r13
+       addq    %r9,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%r9
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpaddq  %xmm11,%xmm5,%xmm5
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       vpaddq  %xmm11,%xmm6,%xmm6
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,221,3
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm5,%xmm10
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %r11,%r13
+       addq    %rdx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpaddq  %xmm11,%xmm6,%xmm6
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       rorq    $5,%r14
+.byte  143,72,120,195,200,56
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpsrlq  $7,%xmm8,%xmm8
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       vpaddq  %xmm11,%xmm7,%xmm7
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+.byte  143,72,120,195,209,7
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       vpxor   %xmm9,%xmm8,%xmm8
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+.byte  143,104,120,195,222,3
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       vpsrlq  $6,%xmm6,%xmm10
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r9,%r13
+       addq    %rbx,%r14
+.byte  143,72,120,195,203,42
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       vpxor   %xmm10,%xmm11,%xmm11
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm9,%xmm11,%xmm11
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpaddq  %xmm11,%xmm7,%xmm7
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     L$xop_00_47
+       rorq    $23,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       rorq    $5,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       rorq    $23,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       rorq    $5,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       rorq    $23,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       rorq    $23,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       rorq    $5,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       rorq    $5,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       rorq    $5,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       rorq    $4,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       rorq    $6,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       rorq    $28,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       rorq    $5,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       rorq    $4,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       rorq    $6,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       rorq    $28,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       rorq    $23,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       rorq    $5,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       rorq    $4,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       rorq    $6,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       rorq    $28,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       rorq    $23,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       rorq    $5,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       rorq    $4,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       rorq    $6,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       rorq    $28,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       rorq    $23,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       rorq    $5,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       rorq    $4,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       rorq    $6,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       rorq    $28,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       rorq    $5,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       rorq    $4,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       rorq    $6,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       rorq    $28,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       rorq    $5,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       rorq    $4,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       rorq    $6,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       rorq    $14,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       rorq    $28,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       rorq    $5,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       rorq    $4,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       rorq    $6,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       rorq    $14,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       rorq    $28,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      L$loop_xop
+
+       movq    152(%rsp),%rsi
+
+       vzeroupper
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+L$epilogue_xop:                        # return point of the XOP code path
+       .byte   0xf3,0xc3               # raw opcode bytes: 0xf3 (REP prefix) + 0xc3 (RET), i.e. "rep ret"
+
+
+
+.p2align       6
+sha512_block_data_order_avx:           # AVX entry: save registers, build an aligned stack frame, load eight 64-bit state words
+
+L$avx_shortcut:                        # alternate label for the same entry address
+       movq    %rsp,%rax               # rax = stack pointer at entry; stored in the frame at 152(%rsp) below
+
+       pushq   %rbx                    # rbx, rbp, r12-r15 are callee-saved and are all used as scratch later
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       shlq    $4,%rdx                 # rdx *= 16 (third argument)
+       subq    $160,%rsp               # reserve 160 bytes: 128-byte scratch area + four 8-byte slots
+       leaq    (%rsi,%rdx,8),%rdx      # rdx = rsi + 128 * (original rdx); presumably one past the last input block - confirm against the loop test
+       andq    $-64,%rsp               # round rsp down to a 64-byte boundary (the vmovdqa stores in L$loop_avx need alignment)
+       movq    %rdi,128+0(%rsp)        # slot 128: first argument; rdi itself is overwritten in L$loop_avx
+       movq    %rsi,128+8(%rsp)        # slot 136: second argument (pointer the 128-byte loads in L$loop_avx read from)
+       movq    %rdx,128+16(%rsp)       # slot 144: the rsi + 128*count value computed above
+       movq    %rax,152(%rsp)          # slot 152: entry rsp, needed because the andq above discards the original alignment
+
+L$prologue_avx:                        # frame is fully set up from this label on
+
+       vzeroupper                      # clear the upper halves of all YMM registers before the 128-bit AVX code
+       movq    0(%rdi),%rax            # rax, rbx, rcx, rdx, r8, r9, r10, r11 = the eight qwords at 0..56(%rdi)
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx           # rdx is free again: its previous value was stored in slot 144
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     L$loop_avx              # target is the next label; the jump skips the .p2align padding in front of it
+.p2align       4
+L$loop_avx:                            # block setup: load 128 bytes from (%rsi), shuffle, add K512 entries, spill sums to 0..112(%rsp)
+       vmovdqa K512+1280(%rip),%xmm11  # xmm11 = 16-byte vpshufb control mask stored at K512+1280; presumably a per-qword byte swap - confirm against the K512 table
+       vmovdqu 0(%rsi),%xmm0           # unaligned loads: xmm0..xmm7 = bytes 0..127 at rsi, 16 bytes each
+       leaq    K512+128(%rip),%rbp     # rbp = K512+128, so displacements -128..96 below address K512+0..K512+224
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0      # reorder the bytes of each loaded vector in place using the mask
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8  # xmm8 = xmm0 + 16 bytes at K512+0 (two 64-bit lanes); xmm0 itself is left unchanged
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9   # table offset advances by 32 per 16-byte vector; NOTE(review): presumably K512 stores each pair twice - confirm
+       vpshufb %xmm11,%xmm7,%xmm7      # last use of the shuffle mask
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11  # xmm11 is reused as a temporary from here on
+       vmovdqa %xmm8,0(%rsp)           # aligned stores are valid: rsp was rounded to a 64-byte boundary in the prologue
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14               # r14 = copy of rax; L$avx_00_47 begins with movq %r14,%rax
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi               # rdi = rbx ... (overwrites the first argument, which the prologue saved at 128+0(%rsp))
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi               # ... rdi = rbx ^ rcx
+       vmovdqa %xmm11,112(%rsp)        # the 128-byte scratch area 0..127(%rsp) now holds all eight sums
+       movq    %r8,%r13                # r13 = copy of r8; L$avx_00_47 begins by rotating r13
+       jmp     L$avx_00_47             # target is the next label; the jump skips the .p2align padding in front of it
+
+.p2align       4
+L$avx_00_47:
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm0,%xmm0
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm7,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm7,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm7,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm0,%xmm0
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm1,%xmm1
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm0,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm0,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm0,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm1,%xmm1
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm2,%xmm2
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm1,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm1,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm1,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm2,%xmm2
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm3,%xmm3
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm2,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm2,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm2,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm3,%xmm3
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm4,%xmm4
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm3,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm3,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm3,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm4,%xmm4
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm5,%xmm5
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm4,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm4,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm4,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm5,%xmm5
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm6,%xmm6
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm5,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm5,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm5,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm6,%xmm6
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm7,%xmm7
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm6,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm6,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm6,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm7,%xmm7
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     L$avx_00_47
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      L$loop_avx
+
+       movq    152(%rsp),%rsi
+
+       vzeroupper
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+L$epilogue_avx:
+       .byte   0xf3,0xc3
+
+
+
+.p2align       6
+sha512_block_data_order_avx2:
+
+L$avx2_shortcut:
+       movq    %rsp,%rax
+
+       pushq   %rbx
+
+       pushq   %rbp
+
+       pushq   %r12
+
+       pushq   %r13
+
+       pushq   %r14
+
+       pushq   %r15
+
+       subq    $1312,%rsp
+       shlq    $4,%rdx
+       andq    $-2048,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       addq    $1152,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+
+L$prologue_avx2:
+
+       vzeroupper
+       subq    $-128,%rsi
+       movq    0(%rdi),%rax
+       movq    %rsi,%r12
+       movq    8(%rdi),%rbx
+       cmpq    %rdx,%rsi
+       movq    16(%rdi),%rcx
+       cmoveq  %rsp,%r12
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     L$oop_avx2
+.p2align       4
+L$oop_avx2:
+       vmovdqu -128(%rsi),%xmm0
+       vmovdqu -128+16(%rsi),%xmm1
+       vmovdqu -128+32(%rsi),%xmm2
+       leaq    K512+128(%rip),%rbp
+       vmovdqu -128+48(%rsi),%xmm3
+       vmovdqu -128+64(%rsi),%xmm4
+       vmovdqu -128+80(%rsi),%xmm5
+       vmovdqu -128+96(%rsi),%xmm6
+       vmovdqu -128+112(%rsi),%xmm7
+
+       vmovdqa 1152(%rbp),%ymm10
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm10,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm10,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+       vpshufb %ymm10,%ymm2,%ymm2
+       vinserti128     $1,64(%r12),%ymm4,%ymm4
+       vpshufb %ymm10,%ymm3,%ymm3
+       vinserti128     $1,80(%r12),%ymm5,%ymm5
+       vpshufb %ymm10,%ymm4,%ymm4
+       vinserti128     $1,96(%r12),%ymm6,%ymm6
+       vpshufb %ymm10,%ymm5,%ymm5
+       vinserti128     $1,112(%r12),%ymm7,%ymm7
+
+       vpaddq  -128(%rbp),%ymm0,%ymm8
+       vpshufb %ymm10,%ymm6,%ymm6
+       vpaddq  -96(%rbp),%ymm1,%ymm9
+       vpshufb %ymm10,%ymm7,%ymm7
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       vpaddq  -32(%rbp),%ymm3,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       vpaddq  0(%rbp),%ymm4,%ymm8
+       vmovdqa %ymm9,32(%rsp)
+       vpaddq  32(%rbp),%ymm5,%ymm9
+       vmovdqa %ymm10,64(%rsp)
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       vmovdqa %ymm11,96(%rsp)
+       leaq    -128(%rsp),%rsp
+       vpaddq  96(%rbp),%ymm7,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       xorq    %r14,%r14
+       vmovdqa %ymm9,32(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %ymm10,64(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %ymm11,96(%rsp)
+       movq    %r9,%r12
+       addq    $32*8,%rbp
+       jmp     L$avx2_00_47
+
+.p2align       4
+L$avx2_00_47:
+       leaq    -128(%rsp),%rsp
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm4,%ymm5,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm0,%ymm0
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm7,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm7,%ymm10
+       vpaddq  %ymm8,%ymm0,%ymm0
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm7,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm0,%ymm0
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  -128(%rbp),%ymm0,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm5,%ymm6,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm1,%ymm1
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm0,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm0,%ymm10
+       vpaddq  %ymm8,%ymm1,%ymm1
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm0,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm1,%ymm1
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  -96(%rbp),%ymm1,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm6,%ymm7,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm2,%ymm2
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm1,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm1,%ymm10
+       vpaddq  %ymm8,%ymm2,%ymm2
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm1,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm2,%ymm2
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm7,%ymm0,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm3,%ymm3
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm2,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm2,%ymm10
+       vpaddq  %ymm8,%ymm3,%ymm3
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm2,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm3,%ymm3
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  -32(%rbp),%ymm3,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    -128(%rsp),%rsp
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm0,%ymm1,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm4,%ymm4
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm3,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm3,%ymm10
+       vpaddq  %ymm8,%ymm4,%ymm4
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm3,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm4,%ymm4
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  0(%rbp),%ymm4,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm1,%ymm2,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm5,%ymm5
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm4,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm4,%ymm10
+       vpaddq  %ymm8,%ymm5,%ymm5
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm4,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm5,%ymm5
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  32(%rbp),%ymm5,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm2,%ymm3,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm6,%ymm6
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm5,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm5,%ymm10
+       vpaddq  %ymm8,%ymm6,%ymm6
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm5,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm6,%ymm6
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm3,%ymm4,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm7,%ymm7
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm6,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm6,%ymm10
+       vpaddq  %ymm8,%ymm7,%ymm7
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm6,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm7,%ymm7
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  96(%rbp),%ymm7,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    256(%rbp),%rbp
+       cmpb    $0,-121(%rbp)
+       jne     L$avx2_00_47
+       addq    0+128(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+128(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+128(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+128(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+128(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+128(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+128(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+128(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       addq    0(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rbp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       cmpq    144(%rbp),%rsi
+       je      L$done_avx2
+
+       xorq    %r14,%r14
+       movq    %rbx,%rdi
+       xorq    %rcx,%rdi
+       movq    %r9,%r12
+       jmp     L$ower_avx2
+.p2align       4
+L$ower_avx2:
+       addq    0+16(%rbp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+16(%rbp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+16(%rbp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+16(%rbp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+16(%rbp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+16(%rbp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+16(%rbp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+16(%rbp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       leaq    -128(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     L$ower_avx2
+
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rsp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       leaq    256(%rsi),%rsi
+       addq    48(%rdi),%r10
+       movq    %rsi,%r12
+       addq    56(%rdi),%r11
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       jbe     L$oop_avx2
+       leaq    (%rsp),%rbp
+
+L$done_avx2:
+       leaq    (%rbp),%rsp
+       movq    152(%rsp),%rsi
+
+       vzeroupper
+       movq    -48(%rsi),%r15
+
+       movq    -40(%rsi),%r14
+
+       movq    -32(%rsi),%r13
+
+       movq    -24(%rsi),%r12
+
+       movq    -16(%rsi),%rbp
+
+       movq    -8(%rsi),%rbx
+
+       leaq    (%rsi),%rsp
+
+L$epilogue_avx2:
        .byte   0xf3,0xc3
 
 
+
index 4724604bdd9bd4e15824473d3b92b653d61e48c1..6bbbfb86410a3909b875435ae7bc3f1f786c1c6c 100644 (file)
@@ -3,9 +3,6 @@
 
 #include <nettle/sha.h>
 
-/* nettle's SHA512 is faster than openssl's */
-#undef ENABLE_SHA512
-
 extern const struct nettle_hash x86_sha1;
 extern const struct nettle_hash x86_sha224;
 extern const struct nettle_hash x86_sha256;