From: Nikos Mavrogiannopoulos Date: Wed, 27 Aug 2014 08:52:13 +0000 (+0200) Subject: updated asm sources X-Git-Tag: gnutls_3_4_0~1020 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=97895066e18abc5689ede9af1a463539ea783e90;p=thirdparty%2Fgnutls.git updated asm sources --- diff --git a/devel/openssl b/devel/openssl index e09ea622bb..34ccd24d0e 160000 --- a/devel/openssl +++ b/devel/openssl @@ -1 +1 @@ -Subproject commit e09ea622bba106e13ab85173c205f354b0f1d481 +Subproject commit 34ccd24d0e609ae26a0b0e0085462f35edc5bcce diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s index b1d69911c7..2b4c72acd4 100644 --- a/lib/accelerated/x86/coff/ghash-x86_64.s +++ b/lib/accelerated/x86/coff/ghash-x86_64.s @@ -990,8 +990,8 @@ gcm_ghash_clmul: pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 - xorps %xmm11,%xmm3 .byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 movups 80(%rdx),%xmm7 xorps %xmm12,%xmm4 @@ -1009,8 +1009,8 @@ gcm_ghash_clmul: pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 - xorps %xmm11,%xmm3 .byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 leaq 64(%r8),%r8 @@ -1028,23 +1028,23 @@ gcm_ghash_clmul: xorps %xmm3,%xmm0 movdqu 32(%r8),%xmm3 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 .byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rdx),%xmm7 -.byte 102,68,15,58,68,218,0 xorps %xmm4,%xmm8 - movdqa %xmm3,%xmm5 +.byte 102,68,15,58,68,218,0 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 - pxor %xmm3,%xmm4 + movdqa %xmm3,%xmm5 pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 movdqa %xmm8,%xmm9 - pslldq $8,%xmm8 .byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 @@ -1053,8 +1053,8 @@ gcm_ghash_clmul: pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 -.byte 102,68,15,58,68,231,0 pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 @@ -1081,32 +1081,31 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pxor %xmm1,%xmm0 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 - leaq 64(%r8),%r8 subq $64,%r9 jnc .Lmod4_loop .Ltail4x: .byte 102,65,15,58,68,199,0 - xorps %xmm12,%xmm4 .byte 102,65,15,58,68,207,17 - xorps %xmm3,%xmm0 .byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 @@ -1186,13 +1185,13 @@ gcm_ghash_clmul: pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 - movdqu (%r8),%xmm5 + movdqu (%r8),%xmm9 pxor %xmm0,%xmm8 -.byte 102,65,15,56,0,234 +.byte 102,69,15,56,0,202 movdqu 16(%r8),%xmm3 pxor %xmm1,%xmm8 - pxor %xmm5,%xmm1 + pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 @@ -1219,9 +1218,9 @@ gcm_ghash_clmul: pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 -.byte 102,15,58,68,234,17 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 @@ -1230,7 +1229,6 @@ gcm_ghash_clmul: psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 -.byte 0x66,0x90 subq $32,%r9 ja .Lmod_loop diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s index e8136025e8..1b7fe3a51c 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s @@ -57,6 +57,8 @@ sha1_block_data_order: movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu + testl $536870912,%r10d + jnz _shaext_shortcut jmp _ssse3_shortcut .p2align 4 @@ -1278,6 +1280,195 @@ sha1_block_data_order: movq 16(%rsp),%rsi .byte 0xf3,0xc3 .LSEH_end_sha1_block_data_order: +.def sha1_block_data_order_shaext; .scl 3; .type 32; .endef +.p2align 5 +sha1_block_data_order_shaext: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha1_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +_shaext_shortcut: + leaq -72(%rsp),%rsp + movaps %xmm6,-8-64(%rax) + movaps %xmm7,-8-48(%rax) + movaps %xmm8,-8-32(%rax) + movaps %xmm9,-8-16(%rax) +.Lprologue_shaext: + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%rax + paddd %xmm4,%xmm1 + cmovneq %rax,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + movaps -8-64(%rax),%xmm6 + movaps -8-48(%rax),%xmm7 + movaps -8-32(%rax),%xmm8 + movaps -8-16(%rax),%xmm9 + movq %rax,%rsp +.Lepilogue_shaext: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_sha1_block_data_order_shaext: .def sha1_block_data_order_ssse3; .scl 3; .type 32; .endef .p2align 4 sha1_block_data_order_ssse3: @@ -2489,6 +2680,7 @@ K_XX_XX: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 @@ -2534,6 +2726,37 @@ se_handler: jmp .Lcommon_seh_tail +.def shaext_handler; .scl 3; .type 32; .endef +.p2align 4 +shaext_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + leaq .Lprologue_shaext(%rip),%r10 + cmpq %r10,%rbx + jb .Lcommon_seh_tail + + leaq .Lepilogue_shaext(%rip),%r10 + cmpq %r10,%rbx + jae .Lcommon_seh_tail + + leaq -8-64(%rax),%rsi + leaq 512(%r8),%rdi + movl $8,%ecx +.long 0xa548f3fc + + jmp .Lcommon_seh_tail .def ssse3_handler; .scl 3; .type 32; .endef .p2align 4 @@ -2630,6 +2853,9 @@ ssse3_handler: .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order +.rva .LSEH_begin_sha1_block_data_order_shaext +.rva .LSEH_end_sha1_block_data_order_shaext +.rva .LSEH_info_sha1_block_data_order_shaext .rva .LSEH_begin_sha1_block_data_order_ssse3 .rva .LSEH_end_sha1_block_data_order_ssse3 .rva .LSEH_info_sha1_block_data_order_ssse3 @@ -2638,6 +2864,9 @@ ssse3_handler: .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler +.LSEH_info_sha1_block_data_order_shaext: +.byte 9,0,0,0 +.rva shaext_handler .LSEH_info_sha1_block_data_order_ssse3: .byte 9,0,0,0 .rva ssse3_handler diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s index edaa67b95f..959a2525bf 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s @@ -56,6 +56,8 @@ sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -1792,6 +1794,237 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.def sha256_block_data_order_shaext; .scl 3; .type 32; .endef +.p2align 6 +sha256_block_data_order_shaext: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +_shaext_shortcut: + leaq -88(%rsp),%rsp + movaps %xmm6,-8-80(%rax) + movaps %xmm7,-8-64(%rax) + movaps %xmm8,-8-48(%rax) + movaps %xmm9,-8-32(%rax) + movaps %xmm10,-8-16(%rax) +.Lprologue_shaext: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -8-80(%rax),%xmm6 + movaps -8-64(%rax),%xmm7 + movaps -8-48(%rax),%xmm8 + movaps -8-32(%rax),%xmm9 + movaps -8-16(%rax),%xmm10 + movq %rax,%rsp +.Lepilogue_shaext: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_sha256_block_data_order_shaext: .def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef .p2align 6 sha256_block_data_order_ssse3: @@ -1842,13 +2075,13 @@ sha256_block_data_order_ssse3: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 .byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 -.byte 102,15,56,0,215 movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 @@ -3002,12 +3235,46 @@ se_handler: popq %rsi .byte 0xf3,0xc3 +.def shaext_handler; .scl 3; .type 32; .endef +.p2align 4 +shaext_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + leaq .Lprologue_shaext(%rip),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + leaq .Lepilogue_shaext(%rip),%r10 + cmpq %r10,%rbx + jae .Lin_prologue + + leaq -8-80(%rax),%rsi + leaq 512(%r8),%rdi + movl $10,%ecx +.long 0xa548f3fc + + jmp .Lin_prologue .section .pdata .p2align 2 .rva .LSEH_begin_sha256_block_data_order .rva .LSEH_end_sha256_block_data_order .rva .LSEH_info_sha256_block_data_order +.rva .LSEH_begin_sha256_block_data_order_shaext +.rva .LSEH_end_sha256_block_data_order_shaext +.rva .LSEH_info_sha256_block_data_order_shaext .rva .LSEH_begin_sha256_block_data_order_ssse3 .rva .LSEH_end_sha256_block_data_order_ssse3 .rva .LSEH_info_sha256_block_data_order_ssse3 @@ -3017,6 +3284,9 @@ se_handler: .byte 9,0,0,0 .rva se_handler .rva .Lprologue,.Lepilogue +.LSEH_info_sha256_block_data_order_shaext: +.byte 9,0,0,0 +.rva shaext_handler .LSEH_info_sha256_block_data_order_ssse3: .byte 9,0,0,0 .rva se_handler diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s index 17f6bebe6e..afa6714a6e 100644 --- a/lib/accelerated/x86/elf/ghash-x86_64.s +++ b/lib/accelerated/x86/elf/ghash-x86_64.s @@ -949,8 +949,8 @@ gcm_ghash_clmul: pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 - xorps %xmm11,%xmm3 .byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 @@ -968,8 +968,8 @@ gcm_ghash_clmul: pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 - xorps %xmm11,%xmm3 .byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx @@ -987,23 +987,23 @@ gcm_ghash_clmul: xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 .byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 -.byte 102,68,15,58,68,218,0 xorps %xmm4,%xmm8 - movdqa %xmm3,%xmm5 +.byte 102,68,15,58,68,218,0 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 - pxor %xmm3,%xmm4 + movdqa %xmm3,%xmm5 pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 movdqa %xmm8,%xmm9 - pslldq $8,%xmm8 .byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 @@ -1012,8 +1012,8 @@ gcm_ghash_clmul: pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 -.byte 102,68,15,58,68,231,0 pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 @@ -1040,32 +1040,31 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pxor %xmm1,%xmm0 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 - leaq 64(%rdx),%rdx subq $64,%rcx jnc .Lmod4_loop .Ltail4x: .byte 102,65,15,58,68,199,0 - xorps %xmm12,%xmm4 .byte 102,65,15,58,68,207,17 - xorps %xmm3,%xmm0 .byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 @@ -1145,13 +1144,13 @@ gcm_ghash_clmul: pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 - movdqu (%rdx),%xmm5 + movdqu (%rdx),%xmm9 pxor %xmm0,%xmm8 -.byte 102,65,15,56,0,234 +.byte 102,69,15,56,0,202 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 - pxor %xmm5,%xmm1 + pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 @@ -1178,9 +1177,9 @@ gcm_ghash_clmul: pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 -.byte 102,15,58,68,234,17 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 @@ -1189,7 +1188,6 @@ gcm_ghash_clmul: psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 -.byte 0x66,0x90 subq $32,%rcx ja .Lmod_loop diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s index 116efd0c18..bed632617f 100644 --- a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s @@ -49,6 +49,8 @@ sha1_block_data_order: movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu + testl $536870912,%r10d + jnz _shaext_shortcut jmp _ssse3_shortcut .align 16 @@ -1268,6 +1270,173 @@ sha1_block_data_order: .Lepilogue: .byte 0xf3,0xc3 .size sha1_block_data_order,.-sha1_block_data_order +.type sha1_block_data_order_shaext,@function +.align 32 +sha1_block_data_order_shaext: +_shaext_shortcut: + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%rax + paddd %xmm4,%xmm1 + cmovneq %rax,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + .byte 0xf3,0xc3 +.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: @@ -2456,6 +2625,7 @@ K_XX_XX: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s index f85e0bb6d8..e1f92d4a78 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s @@ -48,6 +48,8 @@ sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -1782,6 +1784,213 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha256_block_data_order_shaext,@function +.align 64 +sha256_block_data_order_shaext: +_shaext_shortcut: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: @@ -1820,13 +2029,13 @@ sha256_block_data_order_ssse3: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 .byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 -.byte 102,15,56,0,215 movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s index a63034a6fc..9e246794f0 100644 --- a/lib/accelerated/x86/macosx/ghash-x86_64.s +++ b/lib/accelerated/x86/macosx/ghash-x86_64.s @@ -949,8 +949,8 @@ L$_ghash_clmul: pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 - xorps %xmm11,%xmm3 .byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 @@ -968,8 +968,8 @@ L$_ghash_clmul: pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 - xorps %xmm11,%xmm3 .byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx @@ -987,23 +987,23 @@ L$mod4_loop: xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 .byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 -.byte 102,68,15,58,68,218,0 xorps %xmm4,%xmm8 - movdqa %xmm3,%xmm5 +.byte 102,68,15,58,68,218,0 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 - pxor %xmm3,%xmm4 + movdqa %xmm3,%xmm5 pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 movdqa %xmm8,%xmm9 - pslldq $8,%xmm8 .byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa L$7_mask(%rip),%xmm8 @@ -1012,8 +1012,8 @@ L$mod4_loop: pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 -.byte 102,68,15,58,68,231,0 pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 @@ -1040,32 +1040,31 @@ L$mod4_loop: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pxor %xmm1,%xmm0 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 - leaq 64(%rdx),%rdx subq $64,%rcx jnc L$mod4_loop L$tail4x: .byte 102,65,15,58,68,199,0 - xorps %xmm12,%xmm4 .byte 102,65,15,58,68,207,17 - xorps %xmm3,%xmm0 .byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 @@ -1145,13 +1144,13 @@ L$mod_loop: pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 - movdqu (%rdx),%xmm5 + movdqu (%rdx),%xmm9 pxor %xmm0,%xmm8 -.byte 102,65,15,56,0,234 +.byte 102,69,15,56,0,202 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 - pxor %xmm5,%xmm1 + pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 @@ -1178,9 +1177,9 @@ L$mod_loop: pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 -.byte 102,15,58,68,234,17 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 @@ -1189,7 +1188,6 @@ L$mod_loop: psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 -.byte 0x66,0x90 subq $32,%rcx ja L$mod_loop diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s index 9091bd802e..6e63270971 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s @@ -49,6 +49,8 @@ _sha1_block_data_order: movl __gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz L$ialu + testl $536870912,%r10d + jnz _shaext_shortcut jmp _ssse3_shortcut .p2align 4 @@ -1269,6 +1271,173 @@ L$epilogue: .byte 0xf3,0xc3 +.p2align 5 +sha1_block_data_order_shaext: +_shaext_shortcut: + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + decq %rdx + leaq 64(%rsi),%rax + paddd %xmm4,%xmm1 + cmovneq %rax,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz L$oop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + .byte 0xf3,0xc3 + + .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: @@ -2456,6 +2625,7 @@ K_XX_XX: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s index c48240f457..94ea73c417 100644 --- a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s @@ -48,6 +48,8 @@ _sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut testl $512,%r10d jnz L$ssse3_shortcut pushq %rbx @@ -1783,6 +1785,213 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +sha256_block_data_order_shaext: +_shaext_shortcut: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 + + .p2align 6 sha256_block_data_order_ssse3: L$ssse3_shortcut: @@ -1820,13 +2029,13 @@ L$loop_ssse3: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 .byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 -.byte 102,15,56,0,215 movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223